mirror of https://github.com/djohnlewis/stackdump synced 2025-12-16 12:53:28 +00:00

16 Commits

Author SHA1 Message Date
Samuel Lai
4a9c4504b3 Updated bad docs. 2013-11-29 12:57:06 +11:00
Samuel Lai
77dd2def42 Oops, forgot to re-instate the comment index during the backout. 2013-11-29 01:42:17 +11:00
Samuel Lai
75a216f5a4 Backed out the comments-batching change.
It was causing weird perf issues and errors. Didn't really seem like it made things faster; if anything, things became slower.
2013-11-29 01:12:09 +11:00
Samuel Lai
bf09e36928 Changed other models to avoid unnecessary date/time parsing.
Added PRAGMA statements for comments table and changed flow so the siteId_postId index is now created after data has been inserted.
2013-11-29 00:18:54 +11:00
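
A minimal sketch of the two techniques this commit pairs, with illustrative table and column names (not the actual Stackdump schema): relax SQLite durability PRAGMAs for a throwaway import database, and build the siteId_postId index only after the bulk load.

```python
import sqlite3

conn = sqlite3.connect('comments_temp.sqlite')
# Trade crash-safety for write speed; acceptable for a disposable import db.
conn.execute('PRAGMA synchronous = OFF')
conn.execute('PRAGMA journal_mode = MEMORY')
conn.execute('CREATE TABLE comment (site_id INTEGER, post_id INTEGER, text TEXT)')

# ... bulk-insert all comment rows here ...

# Building the index after the load is a single sort pass rather than an
# incremental B-tree update on every insert.
conn.execute('CREATE INDEX siteId_postId ON comment (site_id, post_id)')
conn.commit()
```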
Samuel Lai
cdb8d96508 Comments are now committed in batches and using a 'prepared' statement via executemany.
Also fixed a Windows compatibility bug with the new temp comments db and a bug with the webapp now that the Comment model has moved. Dates are also no longer parsed from their ISO form for comments; instead left as strings and parsed by SQLObject internally as needed.
2013-11-28 23:51:53 +11:00
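
A sketch of the batching approach this commit describes (later backed out, per the commit above). The batch size and the row source are assumptions, not values from the source; sqlite3's executemany() compiles the INSERT once and binds each parameter tuple, avoiding per-row statement overhead.

```python
import sqlite3

BATCH_SIZE = 1000  # assumed; the commit does not state a batch size

def flush(conn, batch):
    # One statement, many parameter tuples; commit per batch, not per row.
    conn.executemany(
        'INSERT INTO comment (site_id, post_id, score, text) VALUES (?, ?, ?, ?)',
        batch)
    conn.commit()
    del batch[:]

def import_comments(conn, rows):
    # rows: any iterable of (site_id, post_id, score, text) tuples
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) >= BATCH_SIZE:
            flush(conn, batch)
    if batch:
        flush(conn, batch)
```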
Samuel Lai
8e3d21f817 Fixed settings for Windows compatibility. 2013-11-28 22:06:33 +11:00
Samuel Lai
6469691e4b Added PowerShell equivalents to launch and manage Stackdump on Windows. 2013-11-28 21:53:45 +11:00
Samuel Lai
65394ac516 More minor fixes. Really should get Stackdump set-up on my dev machine. 2013-11-28 15:07:05 +11:00
Samuel Lai
bcf1d7c71a Again. Forgot to fix site->siteId rename. 2013-11-28 14:39:25 +11:00
Samuel Lai
d36146ae46 More bugs - forgot to rename uses when renaming Comment.site to siteId 2013-11-28 14:38:21 +11:00
Samuel Lai
e1272ce58a Oops, bug with closing temp_db file handle. 2013-11-28 14:35:24 +11:00
Samuel Lai
bff7e13d83 Comment data used during importing is now stored in a separate database to make it easier to delete them afterwards. 2013-11-28 14:23:55 +11:00
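
One plausible shape for this change, with assumed file names (only TEMP_COMMENTS_DATABASE_DIR appears in the diff further down): keep the comment rows in their own SQLite file, so cleanup after an import is deleting one file rather than running a huge DELETE in the main database.

```python
import os
import sqlite3

# TEMP_COMMENTS_DATABASE_DIR comes from settings (see the diff below);
# the file name here is illustrative.
temp_db_path = os.path.join(TEMP_COMMENTS_DATABASE_DIR, 'comments_temp.sqlite')

conn = sqlite3.connect(temp_db_path)
# ... import and query comments during the site import ...
conn.close()  # close the handle before deleting; required on Windows
os.remove(temp_db_path)
```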
Samuel Lai
c0766de8d4 Skips XML character scrubbing, if so configured, for faster performance. 2013-11-28 14:01:00 +11:00
Samuel Lai
644269dd5d Added PyCharm project files to the ignore list. 2013-11-28 13:54:47 +11:00
Sam
6bbf0d7b28 Removed a big duplicate file in Solr. 2013-10-22 23:36:46 +11:00
Sam
71c875437e Added tag v1.1 for changeset 3ad1ff15b528 2013-10-22 23:21:20 +11:00
13 changed files with 916 additions and 870 deletions

View File

@@ -22,3 +22,6 @@ tutorial/.*$
 # ignore the downloaded logos
 ^python/media/images/logos/.*
+
+# PyCharm project files
+^.idea/

BIN
List-StackdumpCommands.ps1 Normal file

Binary file not shown.

BIN
Run-StackdumpCommand.ps1 Normal file

Binary file not shown.

BIN
Start-Python.ps1 Normal file

Binary file not shown.

BIN
Start-Solr.ps1 Normal file

Binary file not shown.

BIN
Start-StackdumpWeb.ps1 Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -246,6 +246,11 @@ class Solr(object):
     Optionally accepts ``timeout`` for wait seconds until giving up on a
     request. Default is ``60`` seconds.
 
+    Optionally accepts ``assume_clean`` to skip cleaning request of invalid XML
+    characters. This offers a slight performance improvement, but only set this
+    to ``True`` if you know your request is clean (e.g. coming from other XML
+    data). Bad things will happen otherwise. Default is ``False``.
+
     Usage::
 
         solr = pysolr.Solr('http://localhost:8983/solr')
@@ -253,10 +258,11 @@ class Solr(object):
         solr = pysolr.Solr('http://localhost:8983/solr', timeout=10)
     """
-    def __init__(self, url, decoder=None, timeout=60):
+    def __init__(self, url, decoder=None, timeout=60, assume_clean=False):
         self.decoder = decoder or json.JSONDecoder()
         self.url = url
         self.timeout = timeout
+        self.assume_clean = assume_clean
         self.log = self._get_log()
         self.session = requests.Session()
         self.session.stream = False
@@ -506,7 +512,10 @@ class Solr(object):
value = "{0}".format(value)
return clean_xml_string(value)
if self.assume_clean:
return value
else:
return clean_xml_string(value)
def _to_python(self, value):
"""

View File

@@ -26,7 +26,7 @@ import html5lib
 from html5lib.filters._base import Filter as HTML5LibFilterBase
 import markdown
 
-from stackdump.models import Site, Badge, Comment, User
+from stackdump.models import Site, Badge, User
 from stackdump import settings
 
 # STATIC VARIABLES

File diff suppressed because it is too large.

View File

@@ -18,7 +18,8 @@ SERVER_PORT = 8080
 SOLR_URL = 'http://localhost:8983/solr/stackdump/'
 
 import os
-DATABASE_CONN_STR = 'sqlite://%s/../../../data/stackdump.sqlite' % os.path.dirname(__file__)
+DATABASE_CONN_STR = 'sqlite:///' + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data', 'stackdump.sqlite')
+TEMP_COMMENTS_DATABASE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data')
 
 # if the website is hosted under a subpath, specify it here. It must end with a
 # slash.

View File

@@ -5,6 +5,10 @@
 from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \
                      DatabaseIndex
 
+ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
+
 class Site(SQLObject):
     name = UnicodeCol()
     desc = UnicodeCol()
@@ -15,34 +19,23 @@ class Site(SQLObject):
     siteKey_index = DatabaseIndex(key, unique=True)
 
 class Badge(SQLObject):
     sourceId = IntCol()
     site = ForeignKey('Site', cascade=True)
     userId = IntCol()
     name = UnicodeCol()
-    date = DateTimeCol()
+    date = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
 
-class Comment(SQLObject):
-    sourceId = IntCol()
-    site = ForeignKey('Site', cascade=True)
-    postId = IntCol()
-    score = IntCol()
-    text = UnicodeCol()
-    creationDate = DateTimeCol()
-    userId = IntCol()
-    siteId_postId_index = DatabaseIndex(site, postId)
-    json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
-
 class User(SQLObject):
     sourceId = IntCol()
     site = ForeignKey('Site', cascade=True)
     reputation = IntCol()
-    creationDate = DateTimeCol()
+    creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
     displayName = UnicodeCol()
     emailHash = UnicodeCol()
-    lastAccessDate = DateTimeCol()
+    lastAccessDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
     websiteUrl = UnicodeCol()
     location = UnicodeCol()
     age = IntCol()
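
What ISO_DATE_FORMAT accepts is the timestamp shape found in the Stack Exchange data-dump XML; the sample value below is illustrative. Passing it as SQLObject's ``datetimeFormat`` lets the column parse the stored string on demand, instead of the importer parsing every date up front.

```python
from datetime import datetime

ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'

# Sample value only; matches the shape of dates in the data-dump XML.
dt = datetime.strptime('2013-11-29T00:18:54.123', ISO_DATE_FORMAT)
print(dt)  # 2013-11-29 00:18:54.123000
```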

View File

@@ -19,9 +19,10 @@ from default_settings import *
 # uncomment if the default host and port for Solr is different.
 #SOLR_URL = 'http://localhost:8983/solr/stackdump/'
 
-# uncomment if the database for Stackdump is not the default SQLite one
-#import os
-#DATABASE_CONN_STR = 'sqlite://%s/../../../data/stackdump.sqlite' % os.path.dirname(__file__)
+# uncomment if the database for Stackdump is not the default SQLite one or you
+# wish to have the database at a different path to the stackdump_root/data
+# directory
+#DATABASE_CONN_STR = 'sqlite:///' + path_to_the_database
 
 # if the website is hosted under a subpath, specify it here. It must end with a
 # slash.