mirror of https://github.com/djohnlewis/stackdump synced 2025-12-17 05:13:32 +00:00

16 Commits

Author SHA1 Message Date
Samuel Lai
4a9c4504b3 Updated bad docs. 2013-11-29 12:57:06 +11:00
Samuel Lai
77dd2def42 Oops, forgot to re-instate the comment index during the backout. 2013-11-29 01:42:17 +11:00
Samuel Lai
75a216f5a4 Backed out the comments-batching change.
It was causing weird perf issues and errors. Didn't really seem like it made things faster; if anything, things became slower.
2013-11-29 01:12:09 +11:00
Samuel Lai
bf09e36928 Changed other models to avoid unnecessary date/time parsing.
Added PRAGMA statements for comments table and changed flow so the siteId_postId index is now created after data has been inserted.
2013-11-29 00:18:54 +11:00
Samuel Lai
cdb8d96508 Comments are now committed in batches and using a 'prepared' statement via executemany.
Also fixed a Windows compatibility bug with the new temp comments db and a bug with the webapp now that the Comment model has moved. Dates are also no longer parsed from their ISO form for comments; instead left as strings and parsed by SQLObject internally as needed.
2013-11-28 23:51:53 +11:00
Samuel Lai
8e3d21f817 Fixed settings for Windows compatibility. 2013-11-28 22:06:33 +11:00
Samuel Lai
6469691e4b Added PowerShell equivalents to launch and manage Stackdump on Windows. 2013-11-28 21:53:45 +11:00
Samuel Lai
65394ac516 More minor fixes. Really should get Stackdump set-up on my dev machine. 2013-11-28 15:07:05 +11:00
Samuel Lai
bcf1d7c71a Again. Forgot to fix site->siteId rename. 2013-11-28 14:39:25 +11:00
Samuel Lai
d36146ae46 More bugs - forgot to rename uses when renaming Comment.site to siteId 2013-11-28 14:38:21 +11:00
Samuel Lai
e1272ce58a Oops, bug with closing temp_db file handle. 2013-11-28 14:35:24 +11:00
Samuel Lai
bff7e13d83 Comment data used during importing is now stored in a separate database to make it easier to delete them afterwards. 2013-11-28 14:23:55 +11:00
Samuel Lai
c0766de8d4 Skips valid XML character scrubbing if configured for faster performance. 2013-11-28 14:01:00 +11:00
Samuel Lai
644269dd5d Added PyCharm project files to the ignore list. 2013-11-28 13:54:47 +11:00
Sam
6bbf0d7b28 Removed a big duplicate file in Solr. 2013-10-22 23:36:46 +11:00
Sam
71c875437e Added tag v1.1 for changeset 3ad1ff15b528 2013-10-22 23:21:20 +11:00
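
For context on the comments-batching work described in cdb8d96508 above (and backed out again in 75a216f5a4): the idea was to buffer parsed comment rows and flush them to the temporary SQLite database with executemany, so a single prepared INSERT serves a whole batch. A rough sketch of that approach, with illustrative table and column names and an assumed batch size:

import sqlite3

conn = sqlite3.connect('temp_comments.sqlite')
conn.execute('CREATE TABLE IF NOT EXISTS comment ('
             'source_id INTEGER, site_id INTEGER, post_id INTEGER, '
             'score INTEGER, text TEXT, creation_date TEXT, user_id INTEGER)')

BATCH_SIZE = 1000  # assumed; the commit does not state the batch size
batch = []

def add_comment(row):
    # row is a 7-tuple matching the column order above
    batch.append(row)
    if len(batch) >= BATCH_SIZE:
        flush_comments()

def flush_comments():
    # executemany reuses one prepared INSERT statement for the whole batch
    conn.executemany('INSERT INTO comment VALUES (?, ?, ?, ?, ?, ?, ?)', batch)
    conn.commit()
    del batch[:]
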
13 changed files with 916 additions and 870 deletions

View File

@@ -22,3 +22,6 @@ tutorial/.*$
# ignore the downloaded logos
^python/media/images/logos/.*
# PyCharm project files
^.idea/

BIN
List-StackdumpCommands.ps1 Normal file

Binary file not shown.

BIN
Run-StackdumpCommand.ps1 Normal file

Binary file not shown.

BIN
Start-Python.ps1 Normal file

Binary file not shown.

BIN
Start-Solr.ps1 Normal file

Binary file not shown.

BIN
Start-StackdumpWeb.ps1 Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -246,6 +246,11 @@ class Solr(object):
Optionally accepts ``timeout`` for wait seconds until giving up on a
request. Default is ``60`` seconds.
Optionally accepts ``assume_clean`` to skip cleaning request of invalid XML
characters. This offers a slight performance improvement, but only set this
to ``True`` if you know your request is clean (e.g. coming from other XML
data). Bad things will happen otherwise. Default is ``False``.
Usage::
solr = pysolr.Solr('http://localhost:8983/solr')
@@ -253,10 +258,11 @@ class Solr(object):
solr = pysolr.Solr('http://localhost:8983/solr', timeout=10)
"""
def __init__(self, url, decoder=None, timeout=60):
def __init__(self, url, decoder=None, timeout=60, assume_clean=False):
self.decoder = decoder or json.JSONDecoder()
self.url = url
self.timeout = timeout
self.assume_clean = assume_clean
self.log = self._get_log()
self.session = requests.Session()
self.session.stream = False
@@ -506,7 +512,10 @@ class Solr(object):
value = "{0}".format(value)
return clean_xml_string(value)
if self.assume_clean:
return value
else:
return clean_xml_string(value)
def _to_python(self, value):
"""

View File

@@ -26,7 +26,7 @@ import html5lib
from html5lib.filters._base import Filter as HTML5LibFilterBase
import markdown
from stackdump.models import Site, Badge, Comment, User
from stackdump.models import Site, Badge, User
from stackdump import settings
# STATIC VARIABLES

View File

@@ -12,15 +12,17 @@ from datetime import datetime
import re
import urllib2
import socket
import tempfile
from optparse import OptionParser
from xml.etree import ElementTree
from sqlobject import sqlhub, connectionForURI, AND, OR, IN, SQLObject
from sqlobject import sqlhub, connectionForURI, AND, IN, SQLObject, \
UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
from sqlobject.sqlbuilder import Delete, Insert
from sqlobject.styles import DefaultStyle
from pysolr import Solr
from stackdump.models import Site, Badge, Comment, User
from stackdump.models import Site, Badge, User
from stackdump import settings
try:
@@ -108,7 +110,7 @@ class BadgeContentHandler(BaseContentHandler):
d['sourceId'] = int(attrs['Id'])
d['userId'] = int(attrs.get('UserId', 0))
d['name'] = attrs.get('Name', '')
d['date'] = datetime.strptime(attrs.get('Date'), ISO_DATE_FORMAT)
d['date'] = attrs.get('Date')
except Exception, e:
# could not parse this, so ignore the row completely
self.cur_props = None
@@ -135,12 +137,12 @@ class CommentContentHandler(BaseContentHandler):
return
try:
d = self.cur_props = { 'site' : self.site }
d = self.cur_props = { 'siteId' : self.site.id }
d['sourceId'] = int(attrs['Id'])
d['postId'] = int(attrs.get('PostId', 0))
d['score'] = int(attrs.get('Score', 0))
d['text'] = attrs.get('Text', '')
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
d['creationDate'] = attrs.get('CreationDate')
d['userId'] = int(attrs.get('UserId', 0))
except Exception, e:
@@ -181,10 +183,10 @@ class UserContentHandler(BaseContentHandler):
d = self.cur_props = { 'site' : self.site }
d['sourceId'] = int(attrs['Id'])
d['reputation'] = int(attrs.get('Reputation', 0))
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
d['creationDate'] = attrs.get('CreationDate')
d['displayName'] = attrs.get('DisplayName', '')
d['emailHash'] = attrs.get('EmailHash', '')
d['lastAccessDate'] = datetime.strptime(attrs.get('LastAccessDate'), ISO_DATE_FORMAT)
d['lastAccessDate'] = attrs.get('LastAccessDate')
d['websiteUrl'] = attrs.get('WebsiteUrl', '')
d['location'] = attrs.get('Location', '')
d['age'] = int(attrs.get('Age', 0))
@@ -342,8 +344,9 @@ class PostContentHandler(xml.sax.ContentHandler):
if self.row_count % 1000 == 0:
print('%-10s Processed %d rows.' % ('[post]', self.row_count))
# only check for finished questions every 1000 rows to speed things up
if self.row_count % 1000 == 0:
# only check for finished questions every 10000 rows to speed things up
if self.row_count % 10000 == 0:
print('Committing completed questions...')
self.commit_finished_questions()
def commit_finished_questions(self):
@@ -400,7 +403,7 @@ class PostContentHandler(xml.sax.ContentHandler):
post_ids.add(a['id'])
# get the comments
comment_objs = Comment.select(AND(Comment.q.site == self.site,
comment_objs = Comment.select(AND(Comment.q.siteId == self.site.id,
IN(Comment.q.postId, list(post_ids))))
# sort the comments out into a dict keyed on the post id
@@ -514,7 +517,10 @@ class PostContentHandler(xml.sax.ContentHandler):
def commit_questions(self, questions, commit=True):
"""
Commits the given list of questions to solr.
Adds the given list of questions to solr.
By default, they are committed immediately. Set the ``commit`` argument
to False to disable this behaviour.
"""
self.solr.add(questions, commit=commit)
@@ -551,6 +557,25 @@ class PostContentHandler(xml.sax.ContentHandler):
for question_id, answers in self.orphan_answers.items():
print('There are %d answers for missing question [ID# %d]. Ignoring orphan answers.' % (len(answers), question_id))
# TEMP COMMENT DATABASE DEFINITION
comment_db_sqlhub = dbconnection.ConnectionHub()
class Comment(SQLObject):
sourceId = IntCol()
siteId = IntCol()
postId = IntCol()
score = IntCol()
text = UnicodeCol()
creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
userId = IntCol()
siteId_postId_index = DatabaseIndex(siteId, postId)
_connection = comment_db_sqlhub
json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
# METHODS
def get_file_path(dir_path, filename):
"""
@@ -593,14 +618,14 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
sys.exit(1)
# connect to the database
print('Connecting to the database...')
print('Connecting to the Stackdump database...')
conn_str = settings.DATABASE_CONN_STR
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')
# connect to solr
print('Connecting to solr...')
solr = Solr(settings.SOLR_URL)
solr = Solr(settings.SOLR_URL, assume_clean=True)
# pysolr doesn't try to connect until a request is made, so we'll make a ping request
try:
solr._send_request('GET', 'admin/ping')
@@ -614,7 +639,6 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
print("Creating tables if they don't exist...")
Site.createTable(ifNotExists=True)
Badge.createTable(ifNotExists=True)
Comment.createTable(ifNotExists=True)
User.createTable(ifNotExists=True)
print('Created.\n')
@@ -742,8 +766,6 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
sqlhub.threadConnection = sqlhub.processConnection.transaction()
conn = sqlhub.threadConnection
# these deletions are done in this order to avoid FK constraint issues
print('\tDeleting comments...')
conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site==site))))
print('\tDeleting badges...')
conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))
print('\tDeleting users...')
@@ -758,11 +780,26 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
solr.commit(expungeDeletes=True)
print('Deleted.\n')
# create the temporary comments database
print('Connecting to the temporary comments database...')
temp_db_file, temp_db_path = tempfile.mkstemp('.sqlite', 'temp_comment_db-' + re.sub(r'[^\w]', '_', site_key) + '-', settings.TEMP_COMMENTS_DATABASE_DIR)
os.close(temp_db_file)
conn_str = 'sqlite:///' + temp_db_path
comment_db_sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.')
Comment.createTable()
print('Schema created.')
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA synchronous = OFF')
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA journal_mode = MEMORY')
print('Pragma configured.\n')
timing_start = time.time()
# start a new transaction
sqlhub.threadConnection = sqlhub.processConnection.transaction()
conn = sqlhub.threadConnection
comment_db_sqlhub.threadConnection = comment_db_sqlhub.processConnection.transaction()
temp_db_conn = comment_db_sqlhub.threadConnection
# create a new Site
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date,
@@ -785,7 +822,7 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
print('[comment] PARSING COMMENTS...')
xml_path = get_file_path(xml_root, 'comments.xml')
print('[comment] start parsing comments.xml...')
handler = CommentContentHandler(conn, site)
handler = CommentContentHandler(temp_db_conn, site)
xml.sax.parse(xml_path, handler)
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
print('[comment] FINISHED PARSING COMMENTS.\n')
@@ -812,8 +849,10 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
print('[post] FINISHED PARSING POSTS.\n')
# DELETE COMMENTS
print('[comment] DELETING COMMENTS FROM DATABASE (they are no longer needed)...')
conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site == site))))
print('[comment] DELETING TEMPORARY COMMENTS DATABASE (they are no longer needed)...')
temp_db_conn.commit(close=True)
comment_db_sqlhub.processConnection.close()
os.remove(temp_db_path)
print('[comment] FINISHED DELETING COMMENTS.\n')
# commit transaction
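
The temporary comments database pattern above, reduced to a self-contained sketch (the Comment model here is a trimmed-down version of the one defined in this file; paths and prefixes are illustrative):

import os
import tempfile
from sqlobject import SQLObject, IntCol, UnicodeCol, connectionForURI, dbconnection

comment_db_sqlhub = dbconnection.ConnectionHub()

class Comment(SQLObject):
    sourceId = IntCol()
    siteId = IntCol()
    postId = IntCol()
    text = UnicodeCol()
    _connection = comment_db_sqlhub

# a throwaway SQLite file holds the comments during the import
temp_db_file, temp_db_path = tempfile.mkstemp('.sqlite', 'temp_comment_db-')
os.close(temp_db_file)  # SQLObject opens the file itself; only the path is needed

comment_db_sqlhub.processConnection = connectionForURI('sqlite:///' + temp_db_path)
Comment.createTable()

# speed over durability: this database is deleted once the import finishes
raw_conn = comment_db_sqlhub.processConnection.getConnection()
raw_conn.execute('PRAGMA synchronous = OFF')
raw_conn.execute('PRAGMA journal_mode = MEMORY')

# ... comments are parsed and inserted here instead of into the main database ...

# deleting every comment afterwards is just closing the connection and removing the file
comment_db_sqlhub.processConnection.close()
os.remove(temp_db_path)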

View File

@@ -18,7 +18,8 @@ SERVER_PORT = 8080
SOLR_URL = 'http://localhost:8983/solr/stackdump/'
import os
DATABASE_CONN_STR = 'sqlite://%s/../../../data/stackdump.sqlite' % os.path.dirname(__file__)
DATABASE_CONN_STR = 'sqlite:///' + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data', 'stackdump.sqlite')
TEMP_COMMENTS_DATABASE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data')
# if the website is hosted under a subpath, specify it here. It must end with a
# slash.
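
The connection-string change above is likely part of the Windows compatibility fix (8e3d21f817): instead of interpolating os.path.dirname(__file__) into a 'sqlite://%s/...' template, the path is built with os.path.join and prefixed with 'sqlite:///'. A sketch of how the resulting values are consumed (directory layout as in default_settings.py):

import os
from sqlobject import connectionForURI, sqlhub

data_dir = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data')
DATABASE_CONN_STR = 'sqlite:///' + os.path.join(data_dir, 'stackdump.sqlite')
# the temporary comments database created by the importer lives alongside it
TEMP_COMMENTS_DATABASE_DIR = data_dir

# the import script hands this straight to SQLObject (see the importer diff above)
sqlhub.processConnection = connectionForURI(DATABASE_CONN_STR)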

View File

@@ -5,6 +5,10 @@
from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \
DatabaseIndex
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
class Site(SQLObject):
name = UnicodeCol()
desc = UnicodeCol()
@@ -15,34 +19,23 @@ class Site(SQLObject):
siteKey_index = DatabaseIndex(key, unique=True)
class Badge(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
userId = IntCol()
name = UnicodeCol()
date = DateTimeCol()
date = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
class Comment(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
postId = IntCol()
score = IntCol()
text = UnicodeCol()
creationDate = DateTimeCol()
userId = IntCol()
siteId_postId_index = DatabaseIndex(site, postId)
json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
class User(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
reputation = IntCol()
creationDate = DateTimeCol()
creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
displayName = UnicodeCol()
emailHash = UnicodeCol()
lastAccessDate = DateTimeCol()
lastAccessDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
websiteUrl = UnicodeCol()
location = UnicodeCol()
age = IntCol()
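
The datetimeFormat changes above pair with the importer changes: raw ISO strings from the dump (e.g. CreationDate) are handed to the models as-is and SQLObject parses them using the declared format, instead of the importer calling datetime.strptime() on every row. A minimal sketch with an illustrative table and an in-memory database:

from sqlobject import SQLObject, UnicodeCol, DateTimeCol, sqlhub, connectionForURI

ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'

sqlhub.processConnection = connectionForURI('sqlite:/:memory:')

class ImportedRow(SQLObject):  # illustrative; not part of the Stackdump models
    name = UnicodeCol()
    creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)

ImportedRow.createTable()

# the ISO string is stored and parsed by SQLObject as needed
row = ImportedRow(name=u'example', creationDate='2013-11-28T23:51:53.000')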

View File

@@ -19,9 +19,10 @@ from default_settings import *
# uncomment if the default host and port for Solr is different.
#SOLR_URL = 'http://localhost:8983/solr/stackdump/'
# uncomment if the database for Stackdump is not the default SQLite one
#import os
#DATABASE_CONN_STR = 'sqlite://%s/../../../data/stackdump.sqlite' % os.path.dirname(__file__)
# uncomment if the database for Stackdump is not the default SQLite one or you
# wish to have the database at a different path to the stackdump_root/data
# directory
#DATABASE_CONN_STR = 'sqlite:///' + path_to_the_database
# if the website is hosted under a subpath, specify it here. It must end with a
# slash.
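
Following the comment above, a local_settings.py override might look like this (the path itself is illustrative):

import os
DATABASE_CONN_STR = 'sqlite:///' + os.path.join(os.path.expanduser('~'), 'stackdump-data', 'stackdump.sqlite')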