1
0
mirror of https://github.com/djohnlewis/stackdump synced 2025-01-22 06:31:38 +00:00

Merge import-perf-improvements branch to default.

This commit is contained in:
Samuel Lai 2013-11-29 13:01:41 +11:00
commit a597b2e588
5 changed files with 908 additions and 866 deletions

View File

@ -246,6 +246,11 @@ class Solr(object):
Optionally accepts ``timeout``, the number of seconds to wait before giving
up on a request. Default is ``60`` seconds.
Optionally accepts ``assume_clean`` to skip cleaning the request of invalid XML
characters. This offers a slight performance improvement, but only set this
to ``True`` if you know your request is clean (e.g. coming from other XML
data). Otherwise, values are passed through without being cleaned, and bad
things will happen. Default is ``False``.
Usage::
solr = pysolr.Solr('http://localhost:8983/solr')
@ -253,10 +258,11 @@ class Solr(object):
solr = pysolr.Solr('http://localhost:8983/solr', timeout=10)
"""
def __init__(self, url, decoder=None, timeout=60):
def __init__(self, url, decoder=None, timeout=60, assume_clean=False):
self.decoder = decoder or json.JSONDecoder()
self.url = url
self.timeout = timeout
self.assume_clean = assume_clean
self.log = self._get_log()
self.session = requests.Session()
self.session.stream = False
@ -506,7 +512,10 @@ class Solr(object):
value = "{0}".format(value)
return clean_xml_string(value)
if self.assume_clean:
return value
else:
return clean_xml_string(value)
def _to_python(self, value):
"""

View File

@ -26,7 +26,7 @@ import html5lib
from html5lib.filters._base import Filter as HTML5LibFilterBase
import markdown
from stackdump.models import Site, Badge, Comment, User
from stackdump.models import Site, Badge, User
from stackdump import settings
# STATIC VARIABLES

File diff suppressed because it is too large Load Diff

View File

@ -19,6 +19,7 @@ SOLR_URL = 'http://localhost:8983/solr/stackdump/'
import os
DATABASE_CONN_STR = 'sqlite:///' + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data', 'stackdump.sqlite')
TEMP_COMMENTS_DATABASE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data')
# if the website is hosted under a subpath, specify it here. It must end with a
# slash.

View File

@ -5,6 +5,10 @@
from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \
DatabaseIndex
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
class Site(SQLObject):
name = UnicodeCol()
desc = UnicodeCol()
@ -15,34 +19,23 @@ class Site(SQLObject):
siteKey_index = DatabaseIndex(key, unique=True)
class Badge(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
userId = IntCol()
name = UnicodeCol()
date = DateTimeCol()
date = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
class Comment(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
postId = IntCol()
score = IntCol()
text = UnicodeCol()
creationDate = DateTimeCol()
userId = IntCol()
siteId_postId_index = DatabaseIndex(site, postId)
json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
class User(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
reputation = IntCol()
creationDate = DateTimeCol()
creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
displayName = UnicodeCol()
emailHash = UnicodeCol()
lastAccessDate = DateTimeCol()
lastAccessDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
websiteUrl = UnicodeCol()
location = UnicodeCol()
age = IntCol()