mirror of https://github.com/djohnlewis/stackdump synced 2025-12-16 12:53:28 +00:00

16 Commits

Author SHA1 Message Date
Samuel Lai
4a9c4504b3 Updated bad docs. 2013-11-29 12:57:06 +11:00
Samuel Lai
77dd2def42 Oops, forgot to re-instate the comment index during the backout. 2013-11-29 01:42:17 +11:00
Samuel Lai
75a216f5a4 Backed out the comments-batching change.
It was causing weird perf issues and errors. Didn't really seem like it made things faster; if anything, things became slower.
2013-11-29 01:12:09 +11:00
Samuel Lai
bf09e36928 Changed other models to avoid unnecessary date/time parsing.
Added PRAGMA statements for comments table and changed flow so the siteId_postId index is now created after data has been inserted.
2013-11-29 00:18:54 +11:00
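
A minimal sketch of the two techniques this commit pairs, with illustrative table and column names (not the actual Stackdump schema): relax SQLite durability PRAGMAs for a throwaway import database, and build the siteId_postId index only after the bulk load.

```python
import sqlite3

conn = sqlite3.connect('comments_temp.sqlite')
# Trade crash-safety for write speed; acceptable for a disposable import db.
conn.execute('PRAGMA synchronous = OFF')
conn.execute('PRAGMA journal_mode = MEMORY')
conn.execute('CREATE TABLE comment (site_id INTEGER, post_id INTEGER, text TEXT)')

# ... bulk-insert all comment rows here ...

# Building the index after the load is a single sort pass rather than an
# incremental B-tree update on every insert.
conn.execute('CREATE INDEX siteId_postId ON comment (site_id, post_id)')
conn.commit()
```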
Samuel Lai
cdb8d96508 Comments are now committed in batches and using a 'prepared' statement via executemany.
Also fixed a Windows compatibility bug with the new temp comments db and a bug with the webapp now that the Comment model has moved. Dates are also no longer parsed from their ISO form for comments; instead left as strings and parsed by SQLObject internally as needed.
2013-11-28 23:51:53 +11:00
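
A sketch of the batching approach this commit describes (later backed out, per the commit above). The batch size and the row source are assumptions, not values from the source; sqlite3's executemany() compiles the INSERT once and binds each parameter tuple, avoiding per-row statement overhead.

```python
import sqlite3

BATCH_SIZE = 1000  # assumed; the commit does not state a batch size

def flush(conn, batch):
    # One statement, many parameter tuples; commit per batch, not per row.
    conn.executemany(
        'INSERT INTO comment (site_id, post_id, score, text) VALUES (?, ?, ?, ?)',
        batch)
    conn.commit()
    del batch[:]

def import_comments(conn, rows):
    # rows: any iterable of (site_id, post_id, score, text) tuples
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) >= BATCH_SIZE:
            flush(conn, batch)
    if batch:
        flush(conn, batch)
```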
Samuel Lai
8e3d21f817 Fixed settings for Windows compatibility. 2013-11-28 22:06:33 +11:00
Samuel Lai
6469691e4b Added PowerShell equivalents to launch and manage Stackdump on Windows. 2013-11-28 21:53:45 +11:00
Samuel Lai
65394ac516 More minor fixes. Really should get Stackdump set-up on my dev machine. 2013-11-28 15:07:05 +11:00
Samuel Lai
bcf1d7c71a Again. Forgot to fix site->siteId rename. 2013-11-28 14:39:25 +11:00
Samuel Lai
d36146ae46 More bugs - forgot to rename uses when renaming Comment.site to siteId 2013-11-28 14:38:21 +11:00
Samuel Lai
e1272ce58a Oops, bug with closing temp_db file handle. 2013-11-28 14:35:24 +11:00
Samuel Lai
bff7e13d83 Comment data used during importing is now stored in a separate database to make it easier to delete them afterwards. 2013-11-28 14:23:55 +11:00
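
One plausible shape for this change, with assumed file names (only TEMP_COMMENTS_DATABASE_DIR appears in the diff further down): keep the comment rows in their own SQLite file, so cleanup after an import is deleting one file rather than running a huge DELETE in the main database.

```python
import os
import sqlite3

# TEMP_COMMENTS_DATABASE_DIR comes from settings (see the diff below);
# the file name here is illustrative.
temp_db_path = os.path.join(TEMP_COMMENTS_DATABASE_DIR, 'comments_temp.sqlite')

conn = sqlite3.connect(temp_db_path)
# ... import and query comments during the site import ...
conn.close()  # close the handle before deleting; required on Windows
os.remove(temp_db_path)
```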
Samuel Lai
c0766de8d4 Skips XML character scrubbing, if so configured, for faster performance. 2013-11-28 14:01:00 +11:00
Samuel Lai
644269dd5d Added PyCharm project files to the ignore list. 2013-11-28 13:54:47 +11:00
Sam
6bbf0d7b28 Removed a big duplicate file in Solr. 2013-10-22 23:36:46 +11:00
Sam
71c875437e Added tag v1.1 for changeset 3ad1ff15b528 2013-10-22 23:21:20 +11:00
13 changed files with 916 additions and 870 deletions

View File

@@ -22,3 +22,6 @@ tutorial/.*$
 # ignore the downloaded logos
 ^python/media/images/logos/.*
+
+# PyCharm project files
+^.idea/

BIN
List-StackdumpCommands.ps1 Normal file

Binary file not shown.

BIN
Run-StackdumpCommand.ps1 Normal file

Binary file not shown.

BIN
Start-Python.ps1 Normal file

Binary file not shown.

BIN
Start-Solr.ps1 Normal file

Binary file not shown.

BIN
Start-StackdumpWeb.ps1 Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -246,6 +246,11 @@ class Solr(object):
     Optionally accepts ``timeout`` for wait seconds until giving up on a
     request. Default is ``60`` seconds.
 
+    Optionally accepts ``assume_clean`` to skip cleaning request of invalid XML
+    characters. This offers a slight performance improvement, but only set this
+    to ``True`` if you know your request is clean (e.g. coming from other XML
+    data). Bad things will happen otherwise. Default is ``False``.
+
     Usage::
 
         solr = pysolr.Solr('http://localhost:8983/solr')
@@ -253,10 +258,11 @@ class Solr(object):
         solr = pysolr.Solr('http://localhost:8983/solr', timeout=10)
     """
-    def __init__(self, url, decoder=None, timeout=60):
+    def __init__(self, url, decoder=None, timeout=60, assume_clean=False):
         self.decoder = decoder or json.JSONDecoder()
         self.url = url
         self.timeout = timeout
+        self.assume_clean = assume_clean
         self.log = self._get_log()
         self.session = requests.Session()
         self.session.stream = False
@@ -506,7 +512,10 @@ class Solr(object):
value = "{0}".format(value)
return clean_xml_string(value)
if self.assume_clean:
return value
else:
return clean_xml_string(value)
def _to_python(self, value):
"""

View File

@@ -26,7 +26,7 @@ import html5lib
 from html5lib.filters._base import Filter as HTML5LibFilterBase
 import markdown
 
-from stackdump.models import Site, Badge, Comment, User
+from stackdump.models import Site, Badge, User
 from stackdump import settings
 
 # STATIC VARIABLES

File diff suppressed because it is too large.

View File

@@ -18,7 +18,8 @@ SERVER_PORT = 8080
 SOLR_URL = 'http://localhost:8983/solr/stackdump/'
 
 import os
-DATABASE_CONN_STR = 'sqlite://%s/../../../data/stackdump.sqlite' % os.path.dirname(__file__)
+DATABASE_CONN_STR = 'sqlite:///' + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data', 'stackdump.sqlite')
+TEMP_COMMENTS_DATABASE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data')
 
 # if the website is hosted under a subpath, specify it here. It must end with a
 # slash.

View File

@@ -5,6 +5,10 @@
 from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \
                      DatabaseIndex
 
+ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
+
 class Site(SQLObject):
     name = UnicodeCol()
     desc = UnicodeCol()
@@ -15,34 +19,23 @@ class Site(SQLObject):
     siteKey_index = DatabaseIndex(key, unique=True)
 
 class Badge(SQLObject):
     sourceId = IntCol()
     site = ForeignKey('Site', cascade=True)
     userId = IntCol()
     name = UnicodeCol()
-    date = DateTimeCol()
+    date = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
 
-class Comment(SQLObject):
-    sourceId = IntCol()
-    site = ForeignKey('Site', cascade=True)
-    postId = IntCol()
-    score = IntCol()
-    text = UnicodeCol()
-    creationDate = DateTimeCol()
-    userId = IntCol()
-    siteId_postId_index = DatabaseIndex(site, postId)
-    json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
-
 class User(SQLObject):
     sourceId = IntCol()
     site = ForeignKey('Site', cascade=True)
     reputation = IntCol()
-    creationDate = DateTimeCol()
+    creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
     displayName = UnicodeCol()
     emailHash = UnicodeCol()
-    lastAccessDate = DateTimeCol()
+    lastAccessDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
     websiteUrl = UnicodeCol()
     location = UnicodeCol()
     age = IntCol()
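
What ISO_DATE_FORMAT accepts is the timestamp shape found in the Stack Exchange data-dump XML; the sample value below is illustrative. Passing it as SQLObject's ``datetimeFormat`` lets the column parse the stored string on demand, instead of the importer parsing every date up front.

```python
from datetime import datetime

ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'

# Sample value only; matches the shape of dates in the data-dump XML.
dt = datetime.strptime('2013-11-29T00:18:54.123', ISO_DATE_FORMAT)
print(dt)  # 2013-11-29 00:18:54.123000
```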

View File

@@ -19,9 +19,10 @@ from default_settings import *
 # uncomment if the default host and port for Solr is different.
 #SOLR_URL = 'http://localhost:8983/solr/stackdump/'
 
-# uncomment if the database for Stackdump is not the default SQLite one
-#import os
-#DATABASE_CONN_STR = 'sqlite://%s/../../../data/stackdump.sqlite' % os.path.dirname(__file__)
+# uncomment if the database for Stackdump is not the default SQLite one or you
+# wish to have the database at a different path to the stackdump_root/data
+# directory
+#DATABASE_CONN_STR = 'sqlite:///' + path_to_the_database
 
 # if the website is hosted under a subpath, specify it here. It must end with a
 # slash.