mirror of
https://github.com/djohnlewis/stackdump
synced 2025-04-05 01:03:27 +00:00
Removed the Jython hacks. We're going with CPython only now.
This commit is contained in:
parent
59ab86dd59
commit
61579cb807
Binary file not shown.
Binary file not shown.
@ -8,7 +8,6 @@ import sys
|
|||||||
import os
|
import os
|
||||||
import xml.sax
|
import xml.sax
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import platform
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from sqlobject import *
|
from sqlobject import *
|
||||||
@ -21,12 +20,9 @@ except ImportError:
|
|||||||
# For Python >= 2.6
|
# For Python >= 2.6
|
||||||
import json
|
import json
|
||||||
|
|
||||||
is_jython = 'Java' in platform.system()
|
|
||||||
script_dir = os.path.dirname(sys.argv[0])
|
script_dir = os.path.dirname(sys.argv[0])
|
||||||
|
|
||||||
# MODELS
|
# MODELS
|
||||||
# use UnicodeCol instead of StringCol; StringCol defaults to ascii when encoding
|
|
||||||
# is unspecified, as it is with Jython zxJDBC.
|
|
||||||
class Site(SQLObject):
|
class Site(SQLObject):
|
||||||
name = UnicodeCol()
|
name = UnicodeCol()
|
||||||
desc = UnicodeCol()
|
desc = UnicodeCol()
|
||||||
@ -66,11 +62,7 @@ class User(SQLObject):
|
|||||||
downVotes = IntCol()
|
downVotes = IntCol()
|
||||||
|
|
||||||
# SAX HANDLERS
|
# SAX HANDLERS
|
||||||
# Jython can't handle the %f format specifier
|
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
|
||||||
if is_jython:
|
|
||||||
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
|
|
||||||
else:
|
|
||||||
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
|
|
||||||
|
|
||||||
class BaseContentHandler(xml.sax.ContentHandler):
|
class BaseContentHandler(xml.sax.ContentHandler):
|
||||||
"""
|
"""
|
||||||
@ -124,8 +116,7 @@ class BadgeContentHandler(BaseContentHandler):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
d = self.cur_props = { 'site' : self.site }
|
d = self.cur_props = { 'site' : self.site }
|
||||||
# this hack to get the Id attr is needed due to Jython bug #1768
|
d['sourceId'] = int(attrs['Id'])
|
||||||
d['sourceId'] = is_jython and int(attrs._attrs.getValue('Id')) or int(attrs['Id'])
|
|
||||||
d['userId'] = int(attrs.get('UserId', 0))
|
d['userId'] = int(attrs.get('UserId', 0))
|
||||||
d['name'] = attrs.get('Name', '')
|
d['name'] = attrs.get('Name', '')
|
||||||
d['date'] = datetime.strptime(attrs.get('Date'), ISO_DATE_FORMAT)
|
d['date'] = datetime.strptime(attrs.get('Date'), ISO_DATE_FORMAT)
|
||||||
@ -156,8 +147,7 @@ class CommentContentHandler(BaseContentHandler):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
d = self.cur_props = { 'site' : self.site }
|
d = self.cur_props = { 'site' : self.site }
|
||||||
# this hack to get the Id attr is needed due to Jython bug #1768
|
d['sourceId'] = int(attrs['Id'])
|
||||||
d['sourceId'] = is_jython and int(attrs._attrs.getValue('Id')) or int(attrs['Id'])
|
|
||||||
d['postId'] = int(attrs.get('PostId', 0))
|
d['postId'] = int(attrs.get('PostId', 0))
|
||||||
d['score'] = int(attrs.get('Score', 0))
|
d['score'] = int(attrs.get('Score', 0))
|
||||||
d['text'] = attrs.get('Text', '')
|
d['text'] = attrs.get('Text', '')
|
||||||
@ -200,8 +190,7 @@ class UserContentHandler(BaseContentHandler):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
d = self.cur_props = { 'site' : site }
|
d = self.cur_props = { 'site' : site }
|
||||||
# this hack to get the Id attr is needed due to Jython bug #1768
|
d['sourceId'] = int(attrs['Id'])
|
||||||
d['sourceId'] = is_jython and int(attrs._attrs.getValue('Id')) or int(attrs['Id'])
|
|
||||||
d['reputation'] = int(attrs.get('Reputation', 0))
|
d['reputation'] = int(attrs.get('Reputation', 0))
|
||||||
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
|
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
|
||||||
d['displayName'] = attrs.get('DisplayName', '')
|
d['displayName'] = attrs.get('DisplayName', '')
|
||||||
@ -268,8 +257,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
d = self.cur_props = { }
|
d = self.cur_props = { }
|
||||||
# this hack to get the Id attr is needed due to Jython bug #1768
|
d['id'] = int(attrs['Id'])
|
||||||
d['id'] = is_jython and int(attrs._attrs.getValue('Id')) or int(attrs['Id'])
|
|
||||||
|
|
||||||
if attrs['PostTypeId'] == '2':
|
if attrs['PostTypeId'] == '2':
|
||||||
# I am an answer.
|
# I am an answer.
|
||||||
@ -521,12 +509,8 @@ db_path = os.path.abspath(os.path.join(script_dir, '../../data/stackdump.sqlite'
|
|||||||
|
|
||||||
# connect to the database
|
# connect to the database
|
||||||
print('Connecting to the database...')
|
print('Connecting to the database...')
|
||||||
if is_jython:
|
conn_str = 'sqlite://' + db_path
|
||||||
conn_str = 'jython_sqlite://' + db_path
|
|
||||||
else: # assume cPython
|
|
||||||
conn_str = 'sqlite://' + db_path
|
|
||||||
sqlhub.processConnection = connectionForURI(conn_str)
|
sqlhub.processConnection = connectionForURI(conn_str)
|
||||||
#sqlhub.processConnection = connectionForURI('jython_sqlite://:memory:')
|
|
||||||
print('Connected.\n')
|
print('Connected.\n')
|
||||||
|
|
||||||
# connect to solr
|
# connect to solr
|
||||||
|
@ -1,466 +0,0 @@
|
|||||||
r"""JSON (JavaScript Object Notation) <http://json.org> is a subset of
|
|
||||||
JavaScript syntax (ECMA-262 3rd edition) used as a lightweight data
|
|
||||||
interchange format.
|
|
||||||
|
|
||||||
:mod:`simplejson` exposes an API familiar to users of the standard library
|
|
||||||
:mod:`marshal` and :mod:`pickle` modules. It is the externally maintained
|
|
||||||
version of the :mod:`json` library contained in Python 2.6, but maintains
|
|
||||||
compatibility with Python 2.4 and Python 2.5 and (currently) has
|
|
||||||
significant performance advantages, even without using the optional C
|
|
||||||
extension for speedups.
|
|
||||||
|
|
||||||
Encoding basic Python object hierarchies::
|
|
||||||
|
|
||||||
>>> import simplejson as json
|
|
||||||
>>> json.dumps(['foo', {'bar': ('baz', None, 1.0, 2)}])
|
|
||||||
'["foo", {"bar": ["baz", null, 1.0, 2]}]'
|
|
||||||
>>> print json.dumps("\"foo\bar")
|
|
||||||
"\"foo\bar"
|
|
||||||
>>> print json.dumps(u'\u1234')
|
|
||||||
"\u1234"
|
|
||||||
>>> print json.dumps('\\')
|
|
||||||
"\\"
|
|
||||||
>>> print json.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True)
|
|
||||||
{"a": 0, "b": 0, "c": 0}
|
|
||||||
>>> from StringIO import StringIO
|
|
||||||
>>> io = StringIO()
|
|
||||||
>>> json.dump(['streaming API'], io)
|
|
||||||
>>> io.getvalue()
|
|
||||||
'["streaming API"]'
|
|
||||||
|
|
||||||
Compact encoding::
|
|
||||||
|
|
||||||
>>> import simplejson as json
|
|
||||||
>>> json.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':'))
|
|
||||||
'[1,2,3,{"4":5,"6":7}]'
|
|
||||||
|
|
||||||
Pretty printing::
|
|
||||||
|
|
||||||
>>> import simplejson as json
|
|
||||||
>>> s = json.dumps({'4': 5, '6': 7}, sort_keys=True, indent=' ')
|
|
||||||
>>> print '\n'.join([l.rstrip() for l in s.splitlines()])
|
|
||||||
{
|
|
||||||
"4": 5,
|
|
||||||
"6": 7
|
|
||||||
}
|
|
||||||
|
|
||||||
Decoding JSON::
|
|
||||||
|
|
||||||
>>> import simplejson as json
|
|
||||||
>>> obj = [u'foo', {u'bar': [u'baz', None, 1.0, 2]}]
|
|
||||||
>>> json.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') == obj
|
|
||||||
True
|
|
||||||
>>> json.loads('"\\"foo\\bar"') == u'"foo\x08ar'
|
|
||||||
True
|
|
||||||
>>> from StringIO import StringIO
|
|
||||||
>>> io = StringIO('["streaming API"]')
|
|
||||||
>>> json.load(io)[0] == 'streaming API'
|
|
||||||
True
|
|
||||||
|
|
||||||
Specializing JSON object decoding::
|
|
||||||
|
|
||||||
>>> import simplejson as json
|
|
||||||
>>> def as_complex(dct):
|
|
||||||
... if '__complex__' in dct:
|
|
||||||
... return complex(dct['real'], dct['imag'])
|
|
||||||
... return dct
|
|
||||||
...
|
|
||||||
>>> json.loads('{"__complex__": true, "real": 1, "imag": 2}',
|
|
||||||
... object_hook=as_complex)
|
|
||||||
(1+2j)
|
|
||||||
>>> from decimal import Decimal
|
|
||||||
>>> json.loads('1.1', parse_float=Decimal) == Decimal('1.1')
|
|
||||||
True
|
|
||||||
|
|
||||||
Specializing JSON object encoding::
|
|
||||||
|
|
||||||
>>> import simplejson as json
|
|
||||||
>>> def encode_complex(obj):
|
|
||||||
... if isinstance(obj, complex):
|
|
||||||
... return [obj.real, obj.imag]
|
|
||||||
... raise TypeError(repr(o) + " is not JSON serializable")
|
|
||||||
...
|
|
||||||
>>> json.dumps(2 + 1j, default=encode_complex)
|
|
||||||
'[2.0, 1.0]'
|
|
||||||
>>> json.JSONEncoder(default=encode_complex).encode(2 + 1j)
|
|
||||||
'[2.0, 1.0]'
|
|
||||||
>>> ''.join(json.JSONEncoder(default=encode_complex).iterencode(2 + 1j))
|
|
||||||
'[2.0, 1.0]'
|
|
||||||
|
|
||||||
|
|
||||||
Using simplejson.tool from the shell to validate and pretty-print::
|
|
||||||
|
|
||||||
$ echo '{"json":"obj"}' | python -m simplejson.tool
|
|
||||||
{
|
|
||||||
"json": "obj"
|
|
||||||
}
|
|
||||||
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
|
|
||||||
Expecting property name: line 1 column 2 (char 2)
|
|
||||||
"""
|
|
||||||
__version__ = '2.2.1'
|
|
||||||
__all__ = [
|
|
||||||
'dump', 'dumps', 'load', 'loads',
|
|
||||||
'JSONDecoder', 'JSONDecodeError', 'JSONEncoder',
|
|
||||||
'OrderedDict',
|
|
||||||
]
|
|
||||||
|
|
||||||
__author__ = 'Bob Ippolito <bob@redivi.com>'
|
|
||||||
|
|
||||||
from decimal import Decimal
|
|
||||||
|
|
||||||
from decoder import JSONDecoder, JSONDecodeError
|
|
||||||
from encoder import JSONEncoder
|
|
||||||
def _import_OrderedDict():
|
|
||||||
import collections
|
|
||||||
try:
|
|
||||||
return collections.OrderedDict
|
|
||||||
except AttributeError:
|
|
||||||
import ordered_dict
|
|
||||||
return ordered_dict.OrderedDict
|
|
||||||
OrderedDict = _import_OrderedDict()
|
|
||||||
|
|
||||||
def _import_c_make_encoder():
|
|
||||||
try:
|
|
||||||
from simplejson._speedups import make_encoder
|
|
||||||
return make_encoder
|
|
||||||
except ImportError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
_default_encoder = JSONEncoder(
|
|
||||||
skipkeys=False,
|
|
||||||
ensure_ascii=True,
|
|
||||||
check_circular=True,
|
|
||||||
allow_nan=True,
|
|
||||||
indent=None,
|
|
||||||
separators=None,
|
|
||||||
encoding='utf-8',
|
|
||||||
default=None,
|
|
||||||
use_decimal=True,
|
|
||||||
namedtuple_as_object=True,
|
|
||||||
tuple_as_array=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
|
|
||||||
allow_nan=True, cls=None, indent=None, separators=None,
|
|
||||||
encoding='utf-8', default=None, use_decimal=True,
|
|
||||||
namedtuple_as_object=True, tuple_as_array=True,
|
|
||||||
**kw):
|
|
||||||
"""Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
|
|
||||||
``.write()``-supporting file-like object).
|
|
||||||
|
|
||||||
If ``skipkeys`` is true then ``dict`` keys that are not basic types
|
|
||||||
(``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
|
|
||||||
will be skipped instead of raising a ``TypeError``.
|
|
||||||
|
|
||||||
If ``ensure_ascii`` is false, then the some chunks written to ``fp``
|
|
||||||
may be ``unicode`` instances, subject to normal Python ``str`` to
|
|
||||||
``unicode`` coercion rules. Unless ``fp.write()`` explicitly
|
|
||||||
understands ``unicode`` (as in ``codecs.getwriter()``) this is likely
|
|
||||||
to cause an error.
|
|
||||||
|
|
||||||
If ``check_circular`` is false, then the circular reference check
|
|
||||||
for container types will be skipped and a circular reference will
|
|
||||||
result in an ``OverflowError`` (or worse).
|
|
||||||
|
|
||||||
If ``allow_nan`` is false, then it will be a ``ValueError`` to
|
|
||||||
serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``)
|
|
||||||
in strict compliance of the JSON specification, instead of using the
|
|
||||||
JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).
|
|
||||||
|
|
||||||
If *indent* is a string, then JSON array elements and object members
|
|
||||||
will be pretty-printed with a newline followed by that string repeated
|
|
||||||
for each level of nesting. ``None`` (the default) selects the most compact
|
|
||||||
representation without any newlines. For backwards compatibility with
|
|
||||||
versions of simplejson earlier than 2.1.0, an integer is also accepted
|
|
||||||
and is converted to a string with that many spaces.
|
|
||||||
|
|
||||||
If ``separators`` is an ``(item_separator, dict_separator)`` tuple
|
|
||||||
then it will be used instead of the default ``(', ', ': ')`` separators.
|
|
||||||
``(',', ':')`` is the most compact JSON representation.
|
|
||||||
|
|
||||||
``encoding`` is the character encoding for str instances, default is UTF-8.
|
|
||||||
|
|
||||||
``default(obj)`` is a function that should return a serializable version
|
|
||||||
of obj or raise TypeError. The default simply raises TypeError.
|
|
||||||
|
|
||||||
If *use_decimal* is true (default: ``True``) then decimal.Decimal
|
|
||||||
will be natively serialized to JSON with full precision.
|
|
||||||
|
|
||||||
If *namedtuple_as_object* is true (default: ``True``),
|
|
||||||
:class:`tuple` subclasses with ``_asdict()`` methods will be encoded
|
|
||||||
as JSON objects.
|
|
||||||
|
|
||||||
If *tuple_as_array* is true (default: ``True``),
|
|
||||||
:class:`tuple` (and subclasses) will be encoded as JSON arrays.
|
|
||||||
|
|
||||||
To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
|
|
||||||
``.default()`` method to serialize additional types), specify it with
|
|
||||||
the ``cls`` kwarg.
|
|
||||||
|
|
||||||
"""
|
|
||||||
# cached encoder
|
|
||||||
if (not skipkeys and ensure_ascii and
|
|
||||||
check_circular and allow_nan and
|
|
||||||
cls is None and indent is None and separators is None and
|
|
||||||
encoding == 'utf-8' and default is None and use_decimal
|
|
||||||
and namedtuple_as_object and tuple_as_array and not kw):
|
|
||||||
iterable = _default_encoder.iterencode(obj)
|
|
||||||
else:
|
|
||||||
if cls is None:
|
|
||||||
cls = JSONEncoder
|
|
||||||
iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii,
|
|
||||||
check_circular=check_circular, allow_nan=allow_nan, indent=indent,
|
|
||||||
separators=separators, encoding=encoding,
|
|
||||||
default=default, use_decimal=use_decimal,
|
|
||||||
namedtuple_as_object=namedtuple_as_object,
|
|
||||||
tuple_as_array=tuple_as_array,
|
|
||||||
**kw).iterencode(obj)
|
|
||||||
# could accelerate with writelines in some versions of Python, at
|
|
||||||
# a debuggability cost
|
|
||||||
for chunk in iterable:
|
|
||||||
fp.write(chunk)
|
|
||||||
|
|
||||||
|
|
||||||
def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
|
|
||||||
allow_nan=True, cls=None, indent=None, separators=None,
|
|
||||||
encoding='utf-8', default=None, use_decimal=True,
|
|
||||||
namedtuple_as_object=True,
|
|
||||||
tuple_as_array=True,
|
|
||||||
**kw):
|
|
||||||
"""Serialize ``obj`` to a JSON formatted ``str``.
|
|
||||||
|
|
||||||
If ``skipkeys`` is false then ``dict`` keys that are not basic types
|
|
||||||
(``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
|
|
||||||
will be skipped instead of raising a ``TypeError``.
|
|
||||||
|
|
||||||
If ``ensure_ascii`` is false, then the return value will be a
|
|
||||||
``unicode`` instance subject to normal Python ``str`` to ``unicode``
|
|
||||||
coercion rules instead of being escaped to an ASCII ``str``.
|
|
||||||
|
|
||||||
If ``check_circular`` is false, then the circular reference check
|
|
||||||
for container types will be skipped and a circular reference will
|
|
||||||
result in an ``OverflowError`` (or worse).
|
|
||||||
|
|
||||||
If ``allow_nan`` is false, then it will be a ``ValueError`` to
|
|
||||||
serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in
|
|
||||||
strict compliance of the JSON specification, instead of using the
|
|
||||||
JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).
|
|
||||||
|
|
||||||
If ``indent`` is a string, then JSON array elements and object members
|
|
||||||
will be pretty-printed with a newline followed by that string repeated
|
|
||||||
for each level of nesting. ``None`` (the default) selects the most compact
|
|
||||||
representation without any newlines. For backwards compatibility with
|
|
||||||
versions of simplejson earlier than 2.1.0, an integer is also accepted
|
|
||||||
and is converted to a string with that many spaces.
|
|
||||||
|
|
||||||
If ``separators`` is an ``(item_separator, dict_separator)`` tuple
|
|
||||||
then it will be used instead of the default ``(', ', ': ')`` separators.
|
|
||||||
``(',', ':')`` is the most compact JSON representation.
|
|
||||||
|
|
||||||
``encoding`` is the character encoding for str instances, default is UTF-8.
|
|
||||||
|
|
||||||
``default(obj)`` is a function that should return a serializable version
|
|
||||||
of obj or raise TypeError. The default simply raises TypeError.
|
|
||||||
|
|
||||||
If *use_decimal* is true (default: ``True``) then decimal.Decimal
|
|
||||||
will be natively serialized to JSON with full precision.
|
|
||||||
|
|
||||||
If *namedtuple_as_object* is true (default: ``True``),
|
|
||||||
:class:`tuple` subclasses with ``_asdict()`` methods will be encoded
|
|
||||||
as JSON objects.
|
|
||||||
|
|
||||||
If *tuple_as_array* is true (default: ``True``),
|
|
||||||
:class:`tuple` (and subclasses) will be encoded as JSON arrays.
|
|
||||||
|
|
||||||
To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
|
|
||||||
``.default()`` method to serialize additional types), specify it with
|
|
||||||
the ``cls`` kwarg.
|
|
||||||
|
|
||||||
"""
|
|
||||||
# cached encoder
|
|
||||||
if (not skipkeys and ensure_ascii and
|
|
||||||
check_circular and allow_nan and
|
|
||||||
cls is None and indent is None and separators is None and
|
|
||||||
encoding == 'utf-8' and default is None and use_decimal
|
|
||||||
and namedtuple_as_object and tuple_as_array and not kw):
|
|
||||||
return _default_encoder.encode(obj)
|
|
||||||
if cls is None:
|
|
||||||
cls = JSONEncoder
|
|
||||||
return cls(
|
|
||||||
skipkeys=skipkeys, ensure_ascii=ensure_ascii,
|
|
||||||
check_circular=check_circular, allow_nan=allow_nan, indent=indent,
|
|
||||||
separators=separators, encoding=encoding, default=default,
|
|
||||||
use_decimal=use_decimal,
|
|
||||||
namedtuple_as_object=namedtuple_as_object,
|
|
||||||
tuple_as_array=tuple_as_array,
|
|
||||||
**kw).encode(obj)
|
|
||||||
|
|
||||||
|
|
||||||
_default_decoder = JSONDecoder(encoding=None, object_hook=None,
|
|
||||||
object_pairs_hook=None)
|
|
||||||
|
|
||||||
|
|
||||||
def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None,
|
|
||||||
parse_int=None, parse_constant=None, object_pairs_hook=None,
|
|
||||||
use_decimal=False, namedtuple_as_object=True, tuple_as_array=True,
|
|
||||||
**kw):
|
|
||||||
"""Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
|
|
||||||
a JSON document) to a Python object.
|
|
||||||
|
|
||||||
*encoding* determines the encoding used to interpret any
|
|
||||||
:class:`str` objects decoded by this instance (``'utf-8'`` by
|
|
||||||
default). It has no effect when decoding :class:`unicode` objects.
|
|
||||||
|
|
||||||
Note that currently only encodings that are a superset of ASCII work,
|
|
||||||
strings of other encodings should be passed in as :class:`unicode`.
|
|
||||||
|
|
||||||
*object_hook*, if specified, will be called with the result of every
|
|
||||||
JSON object decoded and its return value will be used in place of the
|
|
||||||
given :class:`dict`. This can be used to provide custom
|
|
||||||
deserializations (e.g. to support JSON-RPC class hinting).
|
|
||||||
|
|
||||||
*object_pairs_hook* is an optional function that will be called with
|
|
||||||
the result of any object literal decode with an ordered list of pairs.
|
|
||||||
The return value of *object_pairs_hook* will be used instead of the
|
|
||||||
:class:`dict`. This feature can be used to implement custom decoders
|
|
||||||
that rely on the order that the key and value pairs are decoded (for
|
|
||||||
example, :func:`collections.OrderedDict` will remember the order of
|
|
||||||
insertion). If *object_hook* is also defined, the *object_pairs_hook*
|
|
||||||
takes priority.
|
|
||||||
|
|
||||||
*parse_float*, if specified, will be called with the string of every
|
|
||||||
JSON float to be decoded. By default, this is equivalent to
|
|
||||||
``float(num_str)``. This can be used to use another datatype or parser
|
|
||||||
for JSON floats (e.g. :class:`decimal.Decimal`).
|
|
||||||
|
|
||||||
*parse_int*, if specified, will be called with the string of every
|
|
||||||
JSON int to be decoded. By default, this is equivalent to
|
|
||||||
``int(num_str)``. This can be used to use another datatype or parser
|
|
||||||
for JSON integers (e.g. :class:`float`).
|
|
||||||
|
|
||||||
*parse_constant*, if specified, will be called with one of the
|
|
||||||
following strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This
|
|
||||||
can be used to raise an exception if invalid JSON numbers are
|
|
||||||
encountered.
|
|
||||||
|
|
||||||
If *use_decimal* is true (default: ``False``) then it implies
|
|
||||||
parse_float=decimal.Decimal for parity with ``dump``.
|
|
||||||
|
|
||||||
To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
|
|
||||||
kwarg.
|
|
||||||
|
|
||||||
"""
|
|
||||||
return loads(fp.read(),
|
|
||||||
encoding=encoding, cls=cls, object_hook=object_hook,
|
|
||||||
parse_float=parse_float, parse_int=parse_int,
|
|
||||||
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook,
|
|
||||||
use_decimal=use_decimal, **kw)
|
|
||||||
|
|
||||||
|
|
||||||
def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None,
|
|
||||||
parse_int=None, parse_constant=None, object_pairs_hook=None,
|
|
||||||
use_decimal=False, **kw):
|
|
||||||
"""Deserialize ``s`` (a ``str`` or ``unicode`` instance containing a JSON
|
|
||||||
document) to a Python object.
|
|
||||||
|
|
||||||
*encoding* determines the encoding used to interpret any
|
|
||||||
:class:`str` objects decoded by this instance (``'utf-8'`` by
|
|
||||||
default). It has no effect when decoding :class:`unicode` objects.
|
|
||||||
|
|
||||||
Note that currently only encodings that are a superset of ASCII work,
|
|
||||||
strings of other encodings should be passed in as :class:`unicode`.
|
|
||||||
|
|
||||||
*object_hook*, if specified, will be called with the result of every
|
|
||||||
JSON object decoded and its return value will be used in place of the
|
|
||||||
given :class:`dict`. This can be used to provide custom
|
|
||||||
deserializations (e.g. to support JSON-RPC class hinting).
|
|
||||||
|
|
||||||
*object_pairs_hook* is an optional function that will be called with
|
|
||||||
the result of any object literal decode with an ordered list of pairs.
|
|
||||||
The return value of *object_pairs_hook* will be used instead of the
|
|
||||||
:class:`dict`. This feature can be used to implement custom decoders
|
|
||||||
that rely on the order that the key and value pairs are decoded (for
|
|
||||||
example, :func:`collections.OrderedDict` will remember the order of
|
|
||||||
insertion). If *object_hook* is also defined, the *object_pairs_hook*
|
|
||||||
takes priority.
|
|
||||||
|
|
||||||
*parse_float*, if specified, will be called with the string of every
|
|
||||||
JSON float to be decoded. By default, this is equivalent to
|
|
||||||
``float(num_str)``. This can be used to use another datatype or parser
|
|
||||||
for JSON floats (e.g. :class:`decimal.Decimal`).
|
|
||||||
|
|
||||||
*parse_int*, if specified, will be called with the string of every
|
|
||||||
JSON int to be decoded. By default, this is equivalent to
|
|
||||||
``int(num_str)``. This can be used to use another datatype or parser
|
|
||||||
for JSON integers (e.g. :class:`float`).
|
|
||||||
|
|
||||||
*parse_constant*, if specified, will be called with one of the
|
|
||||||
following strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This
|
|
||||||
can be used to raise an exception if invalid JSON numbers are
|
|
||||||
encountered.
|
|
||||||
|
|
||||||
If *use_decimal* is true (default: ``False``) then it implies
|
|
||||||
parse_float=decimal.Decimal for parity with ``dump``.
|
|
||||||
|
|
||||||
To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
|
|
||||||
kwarg.
|
|
||||||
|
|
||||||
"""
|
|
||||||
if (cls is None and encoding is None and object_hook is None and
|
|
||||||
parse_int is None and parse_float is None and
|
|
||||||
parse_constant is None and object_pairs_hook is None
|
|
||||||
and not use_decimal and not kw):
|
|
||||||
return _default_decoder.decode(s)
|
|
||||||
if cls is None:
|
|
||||||
cls = JSONDecoder
|
|
||||||
if object_hook is not None:
|
|
||||||
kw['object_hook'] = object_hook
|
|
||||||
if object_pairs_hook is not None:
|
|
||||||
kw['object_pairs_hook'] = object_pairs_hook
|
|
||||||
if parse_float is not None:
|
|
||||||
kw['parse_float'] = parse_float
|
|
||||||
if parse_int is not None:
|
|
||||||
kw['parse_int'] = parse_int
|
|
||||||
if parse_constant is not None:
|
|
||||||
kw['parse_constant'] = parse_constant
|
|
||||||
if use_decimal:
|
|
||||||
if parse_float is not None:
|
|
||||||
raise TypeError("use_decimal=True implies parse_float=Decimal")
|
|
||||||
kw['parse_float'] = Decimal
|
|
||||||
return cls(encoding=encoding, **kw).decode(s)
|
|
||||||
|
|
||||||
|
|
||||||
def _toggle_speedups(enabled):
|
|
||||||
import simplejson.decoder as dec
|
|
||||||
import simplejson.encoder as enc
|
|
||||||
import simplejson.scanner as scan
|
|
||||||
c_make_encoder = _import_c_make_encoder()
|
|
||||||
if enabled:
|
|
||||||
dec.scanstring = dec.c_scanstring or dec.py_scanstring
|
|
||||||
enc.c_make_encoder = c_make_encoder
|
|
||||||
enc.encode_basestring_ascii = (enc.c_encode_basestring_ascii or
|
|
||||||
enc.py_encode_basestring_ascii)
|
|
||||||
scan.make_scanner = scan.c_make_scanner or scan.py_make_scanner
|
|
||||||
else:
|
|
||||||
dec.scanstring = dec.py_scanstring
|
|
||||||
enc.c_make_encoder = None
|
|
||||||
enc.encode_basestring_ascii = enc.py_encode_basestring_ascii
|
|
||||||
scan.make_scanner = scan.py_make_scanner
|
|
||||||
dec.make_scanner = scan.make_scanner
|
|
||||||
global _default_decoder
|
|
||||||
_default_decoder = JSONDecoder(
|
|
||||||
encoding=None,
|
|
||||||
object_hook=None,
|
|
||||||
object_pairs_hook=None,
|
|
||||||
)
|
|
||||||
global _default_encoder
|
|
||||||
_default_encoder = JSONEncoder(
|
|
||||||
skipkeys=False,
|
|
||||||
ensure_ascii=True,
|
|
||||||
check_circular=True,
|
|
||||||
allow_nan=True,
|
|
||||||
indent=None,
|
|
||||||
separators=None,
|
|
||||||
encoding='utf-8',
|
|
||||||
default=None,
|
|
||||||
)
|
|
File diff suppressed because it is too large
Load Diff
@ -1,421 +0,0 @@
|
|||||||
"""Implementation of JSONDecoder
|
|
||||||
"""
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import struct
|
|
||||||
|
|
||||||
from simplejson.scanner import make_scanner
|
|
||||||
def _import_c_scanstring():
|
|
||||||
try:
|
|
||||||
from simplejson._speedups import scanstring
|
|
||||||
return scanstring
|
|
||||||
except ImportError:
|
|
||||||
return None
|
|
||||||
c_scanstring = _import_c_scanstring()
|
|
||||||
|
|
||||||
__all__ = ['JSONDecoder']
|
|
||||||
|
|
||||||
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
|
|
||||||
|
|
||||||
def _floatconstants():
|
|
||||||
_BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
|
|
||||||
# The struct module in Python 2.4 would get frexp() out of range here
|
|
||||||
# when an endian is specified in the format string. Fixed in Python 2.5+
|
|
||||||
if sys.byteorder != 'big':
|
|
||||||
_BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
|
|
||||||
nan, inf = struct.unpack('dd', _BYTES)
|
|
||||||
return nan, inf, -inf
|
|
||||||
|
|
||||||
NaN, PosInf, NegInf = _floatconstants()
|
|
||||||
|
|
||||||
|
|
||||||
class JSONDecodeError(ValueError):
|
|
||||||
"""Subclass of ValueError with the following additional properties:
|
|
||||||
|
|
||||||
msg: The unformatted error message
|
|
||||||
doc: The JSON document being parsed
|
|
||||||
pos: The start index of doc where parsing failed
|
|
||||||
end: The end index of doc where parsing failed (may be None)
|
|
||||||
lineno: The line corresponding to pos
|
|
||||||
colno: The column corresponding to pos
|
|
||||||
endlineno: The line corresponding to end (may be None)
|
|
||||||
endcolno: The column corresponding to end (may be None)
|
|
||||||
|
|
||||||
"""
|
|
||||||
def __init__(self, msg, doc, pos, end=None):
|
|
||||||
ValueError.__init__(self, errmsg(msg, doc, pos, end=end))
|
|
||||||
self.msg = msg
|
|
||||||
self.doc = doc
|
|
||||||
self.pos = pos
|
|
||||||
self.end = end
|
|
||||||
self.lineno, self.colno = linecol(doc, pos)
|
|
||||||
if end is not None:
|
|
||||||
self.endlineno, self.endcolno = linecol(doc, end)
|
|
||||||
else:
|
|
||||||
self.endlineno, self.endcolno = None, None
|
|
||||||
|
|
||||||
|
|
||||||
def linecol(doc, pos):
|
|
||||||
lineno = doc.count('\n', 0, pos) + 1
|
|
||||||
if lineno == 1:
|
|
||||||
colno = pos
|
|
||||||
else:
|
|
||||||
colno = pos - doc.rindex('\n', 0, pos)
|
|
||||||
return lineno, colno
|
|
||||||
|
|
||||||
|
|
||||||
def errmsg(msg, doc, pos, end=None):
|
|
||||||
# Note that this function is called from _speedups
|
|
||||||
lineno, colno = linecol(doc, pos)
|
|
||||||
if end is None:
|
|
||||||
#fmt = '{0}: line {1} column {2} (char {3})'
|
|
||||||
#return fmt.format(msg, lineno, colno, pos)
|
|
||||||
fmt = '%s: line %d column %d (char %d)'
|
|
||||||
return fmt % (msg, lineno, colno, pos)
|
|
||||||
endlineno, endcolno = linecol(doc, end)
|
|
||||||
#fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
|
|
||||||
#return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
|
|
||||||
fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
|
|
||||||
return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
|
|
||||||
|
|
||||||
|
|
||||||
_CONSTANTS = {
|
|
||||||
'-Infinity': NegInf,
|
|
||||||
'Infinity': PosInf,
|
|
||||||
'NaN': NaN,
|
|
||||||
}
|
|
||||||
|
|
||||||
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
|
|
||||||
BACKSLASH = {
|
|
||||||
'"': u'"', '\\': u'\\', '/': u'/',
|
|
||||||
'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
|
|
||||||
}
|
|
||||||
|
|
||||||
DEFAULT_ENCODING = "utf-8"
|
|
||||||
|
|
||||||
def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # remember where the string opened for error reporting
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise JSONDecodeError(
                "Unterminated string starting at", s, begin)
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content is contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character %r at" % (terminator,)
                #msg = "Invalid control character {0!r} at".format(terminator)
                raise JSONDecodeError(msg, s, end)
            else:
                # non-strict mode: keep the raw control character
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise JSONDecodeError(
                "Unterminated string starting at", s, begin)
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise JSONDecodeError(msg, s, end)
            end += 1
        else:
            # Unicode escape sequence
            esc = s[end + 1:end + 5]
            next_end = end + 5
            if len(esc) != 4:
                msg = "Invalid \\uXXXX escape"
                raise JSONDecodeError(msg, s, end)
            uni = int(esc, 16)
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise JSONDecodeError(msg, s, end)
                esc2 = s[end + 7:end + 11]
                if len(esc2) != 4:
                    raise JSONDecodeError(msg, s, end)
                uni2 = int(esc2, 16)
                # combine the high/low surrogates into one code point
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
|
|
||||||
|
|
||||||
|
|
||||||
# Use speedup if available: prefer the C extension's scanstring, fall back
# to the pure-Python implementation above.
scanstring = c_scanstring or py_scanstring

# JSON-insignificant whitespace, as a regex (for skipping runs) and as a
# string (for fast single-character membership tests).
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
WHITESPACE_STR = ' \t\n\r'
|
|
||||||
|
|
||||||
def JSONObject((s, end), encoding, strict, scan_once, object_hook,
        object_pairs_hook, memo=None,
        _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    # Parse a JSON object; ``end`` points just past the opening '{'.
    # Returns (object, index-after-closing-brace).
    # Backwards compatibility
    if memo is None:
        memo = {}
    # memoize key strings so repeated keys share a single object
    memo_get = memo.setdefault
    pairs = []
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise JSONDecodeError("Expecting property name", s, end)
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)
        key = memo_get(key, key)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise JSONDecodeError("Expecting : delimiter", s, end)

        end += 1

        # skip whitespace after ':' (fast path: at most two chars checked
        # directly before falling back to the regex)
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise JSONDecodeError("Expecting object", s, end)
        pairs.append((key, value))

        # skip whitespace, then read the pair terminator ('}' or ',')
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise JSONDecodeError("Expecting , delimiter", s, end - 1)

        # skip whitespace before the next property name
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise JSONDecodeError("Expecting property name", s, end - 1)

    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
|
|
||||||
|
|
||||||
def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
|
|
||||||
values = []
|
|
||||||
nextchar = s[end:end + 1]
|
|
||||||
if nextchar in _ws:
|
|
||||||
end = _w(s, end + 1).end()
|
|
||||||
nextchar = s[end:end + 1]
|
|
||||||
# Look-ahead for trivial empty array
|
|
||||||
if nextchar == ']':
|
|
||||||
return values, end + 1
|
|
||||||
_append = values.append
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
value, end = scan_once(s, end)
|
|
||||||
except StopIteration:
|
|
||||||
raise JSONDecodeError("Expecting object", s, end)
|
|
||||||
_append(value)
|
|
||||||
nextchar = s[end:end + 1]
|
|
||||||
if nextchar in _ws:
|
|
||||||
end = _w(s, end + 1).end()
|
|
||||||
nextchar = s[end:end + 1]
|
|
||||||
end += 1
|
|
||||||
if nextchar == ']':
|
|
||||||
break
|
|
||||||
elif nextchar != ',':
|
|
||||||
raise JSONDecodeError("Expecting , delimiter", s, end)
|
|
||||||
|
|
||||||
try:
|
|
||||||
if s[end] in _ws:
|
|
||||||
end += 1
|
|
||||||
if s[end] in _ws:
|
|
||||||
end = _w(s, end + 1).end()
|
|
||||||
except IndexError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return values, end
|
|
||||||
|
|
||||||
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder.

    Default translations:

        JSON object        -> dict
        JSON array         -> list
        JSON string        -> unicode
        JSON number (int)  -> int, long
        JSON number (real) -> float
        true / false       -> True / False
        null               -> None

    The non-standard constants ``NaN``, ``Infinity`` and ``-Infinity``
    are also accepted and mapped to the corresponding ``float`` values.
    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """
        *encoding* is the encoding used to interpret ``str`` inputs
        (``'utf-8'`` by default); it has no effect on ``unicode`` inputs.
        Only encodings that are a superset of ASCII work; other encodings
        should be decoded to ``unicode`` by the caller first.

        *object_hook* is called with every decoded JSON object (dict) and
        its return value is used instead (e.g. for JSON-RPC class hinting).

        *object_pairs_hook* is called with an ordered list of (key, value)
        pairs for every object literal and its return value is used instead
        of a dict (e.g. ``collections.OrderedDict`` preserves key order).
        It takes priority over *object_hook*.

        *parse_float* / *parse_int* are called with the string form of
        every JSON real / integer; they default to ``float`` and ``int``.

        *parse_constant* is called with ``'-Infinity'``, ``'Infinity'`` or
        ``'NaN'``; supply one to reject these non-standard values.

        *strict*, when True (the default), makes unescaped control
        characters inside strings a parse error.
        """
        self.encoding = encoding
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        self.parse_float = parse_float if parse_float else float
        self.parse_int = parse_int if parse_int else int
        self.parse_constant = (parse_constant if parse_constant
                               else _CONSTANTS.__getitem__)
        self.strict = strict
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        self.memo = {}
        # must come last: make_scanner reads the attributes set above
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a complete JSON document).

        Raises JSONDecodeError if anything but whitespace follows the
        document.
        """
        obj, end = self.raw_decode(s, idx=_w(s, 0).end())
        end = _w(s, end).end()
        if end != len(s):
            raise JSONDecodeError("Extra data", s, end, len(s))
        return obj

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        Useful for decoding a document that may have trailing data.
        """
        try:
            return self.scan_once(s, idx)
        except StopIteration:
            raise JSONDecodeError("No JSON object could be decoded", s, idx)
|
|
File diff suppressed because it is too large
Load Diff
@ -1,119 +0,0 @@
|
|||||||
"""Drop-in replacement for collections.OrderedDict by Raymond Hettinger
|
|
||||||
|
|
||||||
http://code.activestate.com/recipes/576693/
|
|
||||||
|
|
||||||
"""
|
|
||||||
from UserDict import DictMixin
|
|
||||||
|
|
||||||
# Modified from original to support Python 2.4, see
|
|
||||||
# http://code.google.com/p/simplejson/issues/detail?id=53
|
|
||||||
try:
|
|
||||||
all
|
|
||||||
except NameError:
|
|
||||||
def all(seq):
|
|
||||||
for elem in seq:
|
|
||||||
if not elem:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
class OrderedDict(dict, DictMixin):
    # dict subclass that remembers insertion order, implemented with a
    # circular doubly linked list of [key, prev, next] cells rooted at a
    # sentinel (self.__end); self.__map gives O(1) access to each cell.

    def __init__(self, *args, **kwds):
        if len(args) > 1:
            raise TypeError('expected at most 1 arguments, got %d' % len(args))
        try:
            # only initialise the linked list once (guards repeated __init__)
            self.__end
        except AttributeError:
            self.clear()
        self.update(*args, **kwds)

    def clear(self):
        self.__end = end = []
        end += [None, end, end]         # sentinel node for doubly linked list
        self.__map = {}                 # key --> [key, prev, next]
        dict.clear(self)

    def __setitem__(self, key, value):
        if key not in self:
            # splice a new cell in just before the sentinel (list tail)
            end = self.__end
            curr = end[1]
            curr[2] = end[1] = self.__map[key] = [key, curr, end]
        dict.__setitem__(self, key, value)

    def __delitem__(self, key):
        dict.__delitem__(self, key)
        # unlink the cell from the ordering list
        key, prev, next = self.__map.pop(key)
        prev[2] = next
        next[1] = prev

    def __iter__(self):
        # walk the list forwards: keys in insertion order
        end = self.__end
        curr = end[2]
        while curr is not end:
            yield curr[0]
            curr = curr[2]

    def __reversed__(self):
        # walk the list backwards: keys in reverse insertion order
        end = self.__end
        curr = end[1]
        while curr is not end:
            yield curr[0]
            curr = curr[1]

    def popitem(self, last=True):
        # remove and return a (key, value) pair from the newest (last=True)
        # or oldest end of the insertion order
        if not self:
            raise KeyError('dictionary is empty')
        # Modified from original to support Python 2.4, see
        # http://code.google.com/p/simplejson/issues/detail?id=53
        if last:
            key = reversed(self).next()
        else:
            key = iter(self).next()
        value = self.pop(key)
        return key, value

    def __reduce__(self):
        # pickle support: temporarily strip the linked-list internals so
        # the pickle carries only the items (plus any extra instance state)
        items = [[k, self[k]] for k in self]
        tmp = self.__map, self.__end
        del self.__map, self.__end
        inst_dict = vars(self).copy()
        self.__map, self.__end = tmp
        if inst_dict:
            return (self.__class__, (items,), inst_dict)
        return self.__class__, (items,)

    def keys(self):
        return list(self)

    # derived mapping methods supplied by DictMixin, built on the
    # primitives defined above
    setdefault = DictMixin.setdefault
    update = DictMixin.update
    pop = DictMixin.pop
    values = DictMixin.values
    items = DictMixin.items
    iterkeys = DictMixin.iterkeys
    itervalues = DictMixin.itervalues
    iteritems = DictMixin.iteritems

    def __repr__(self):
        if not self:
            return '%s()' % (self.__class__.__name__,)
        return '%s(%r)' % (self.__class__.__name__, self.items())

    def copy(self):
        return self.__class__(self)

    @classmethod
    def fromkeys(cls, iterable, value=None):
        d = cls()
        for key in iterable:
            d[key] = value
        return d

    def __eq__(self, other):
        # order-sensitive comparison against another OrderedDict;
        # order-insensitive (plain dict semantics) otherwise
        if isinstance(other, OrderedDict):
            return len(self)==len(other) and \
                   all(p==q for p, q in zip(self.items(), other.items()))
        return dict.__eq__(self, other)

    def __ne__(self, other):
        return not self == other
|
|
@ -1,77 +0,0 @@
|
|||||||
"""JSON token scanner
|
|
||||||
"""
|
|
||||||
import re
|
|
||||||
def _import_c_make_scanner():
|
|
||||||
try:
|
|
||||||
from simplejson._speedups import make_scanner
|
|
||||||
return make_scanner
|
|
||||||
except ImportError:
|
|
||||||
return None
|
|
||||||
c_make_scanner = _import_c_make_scanner()
|
|
||||||
|
|
||||||
__all__ = ['make_scanner']
|
|
||||||
|
|
||||||
NUMBER_RE = re.compile(
|
|
||||||
r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?',
|
|
||||||
(re.VERBOSE | re.MULTILINE | re.DOTALL))
|
|
||||||
|
|
||||||
def py_make_scanner(context):
    """Return a ``scan_once(string, idx)`` callable that parses one JSON
    value starting at ``string[idx]`` and returns ``(value, end_index)``.

    All decoder options are read from *context* (a JSONDecoder instance)
    once, up front, so the hot path only touches local names.  Running off
    the end of the input or hitting an unparseable token raises
    StopIteration, which callers translate into a JSONDecodeError.
    """
    parse_object = context.parse_object
    parse_array = context.parse_array
    parse_string = context.parse_string
    match_number = NUMBER_RE.match
    encoding = context.encoding
    strict = context.strict
    parse_float = context.parse_float
    parse_int = context.parse_int
    parse_constant = context.parse_constant
    object_hook = context.object_hook
    object_pairs_hook = context.object_pairs_hook
    memo = context.memo

    def _scan_once(string, idx):
        # dispatch on the first character of the next token
        try:
            nextchar = string[idx]
        except IndexError:
            raise StopIteration

        if nextchar == '"':
            return parse_string(string, idx + 1, encoding, strict)
        elif nextchar == '{':
            return parse_object((string, idx + 1), encoding, strict,
                _scan_once, object_hook, object_pairs_hook, memo)
        elif nextchar == '[':
            return parse_array((string, idx + 1), _scan_once)
        elif nextchar == 'n' and string[idx:idx + 4] == 'null':
            return None, idx + 4
        elif nextchar == 't' and string[idx:idx + 4] == 'true':
            return True, idx + 4
        elif nextchar == 'f' and string[idx:idx + 5] == 'false':
            return False, idx + 5

        m = match_number(string, idx)
        if m is not None:
            integer, frac, exp = m.groups()
            # a fraction or exponent makes it a real; otherwise an integer
            if frac or exp:
                res = parse_float(integer + (frac or '') + (exp or ''))
            else:
                res = parse_int(integer)
            return res, m.end()
        elif nextchar == 'N' and string[idx:idx + 3] == 'NaN':
            return parse_constant('NaN'), idx + 3
        elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity':
            return parse_constant('Infinity'), idx + 8
        elif nextchar == '-' and string[idx:idx + 9] == '-Infinity':
            return parse_constant('-Infinity'), idx + 9
        else:
            raise StopIteration

    def scan_once(string, idx):
        try:
            return _scan_once(string, idx)
        finally:
            # the key-memo dict only lives for one top-level scan
            memo.clear()

    return scan_once

# Prefer the C scanner factory when available.
make_scanner = c_make_scanner or py_make_scanner
|
|
@ -1,39 +0,0 @@
|
|||||||
r"""Command-line tool to validate and pretty-print JSON
|
|
||||||
|
|
||||||
Usage::
|
|
||||||
|
|
||||||
$ echo '{"json":"obj"}' | python -m simplejson.tool
|
|
||||||
{
|
|
||||||
"json": "obj"
|
|
||||||
}
|
|
||||||
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
|
|
||||||
Expecting property name: line 1 column 2 (char 2)
|
|
||||||
|
|
||||||
"""
|
|
||||||
import sys
|
|
||||||
import simplejson as json
|
|
||||||
|
|
||||||
def main():
    # Validate and pretty-print a JSON document.
    # Usage: tool.py [infile [outfile]]; defaults to stdin/stdout.
    if len(sys.argv) == 1:
        infile = sys.stdin
        outfile = sys.stdout
    elif len(sys.argv) == 2:
        infile = open(sys.argv[1], 'rb')
        outfile = sys.stdout
    elif len(sys.argv) == 3:
        infile = open(sys.argv[1], 'rb')
        outfile = open(sys.argv[2], 'wb')
    else:
        raise SystemExit(sys.argv[0] + " [infile [outfile]]")
    try:
        # preserve key order and decode reals as Decimal so the
        # round-tripped output is lossless
        obj = json.load(infile,
                        object_pairs_hook=json.OrderedDict,
                        use_decimal=True)
    except ValueError, e:
        # invalid JSON: exit with the parse error as the message
        raise SystemExit(e)
    json.dump(obj, outfile, sort_keys=True, indent='    ', use_decimal=True)
    outfile.write('\n')


if __name__ == '__main__':
    main()
|
|
@ -121,9 +121,11 @@ document 7
|
|||||||
# TODO: unicode support is pretty sloppy. define it better.
|
# TODO: unicode support is pretty sloppy. define it better.
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import htmlentitydefs
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
import types
|
||||||
import urllib
|
import urllib
|
||||||
import urllib2
|
import urllib2
|
||||||
from urlparse import urlsplit, urlunsplit
|
from urlparse import urlsplit, urlunsplit
|
||||||
@ -149,15 +151,9 @@ try:
|
|||||||
# For Python < 2.6 or people using a newer version of simplejson
|
# For Python < 2.6 or people using a newer version of simplejson
|
||||||
import simplejson as json
|
import simplejson as json
|
||||||
except ImportError:
|
except ImportError:
|
||||||
try:
|
|
||||||
# For Python >= 2.6
|
# For Python >= 2.6
|
||||||
import json
|
import json
|
||||||
except ImportError:
|
|
||||||
# Jython has no in-built JSON package, so we'll use the specially
|
|
||||||
# included simplejson version. We don't want that to override the
|
|
||||||
# Python-bundled version though, hence this hack.
|
|
||||||
import jython_simplejson as simplejson
|
|
||||||
import jython_simplejson as json
|
|
||||||
try:
|
try:
|
||||||
# Desirable from a timeout perspective.
|
# Desirable from a timeout perspective.
|
||||||
from httplib2 import Http
|
from httplib2 import Http
|
||||||
@ -171,9 +167,17 @@ try:
|
|||||||
except NameError:
|
except NameError:
|
||||||
from sets import Set as set
|
from sets import Set as set
|
||||||
|
|
||||||
|
try:
|
||||||
|
# TODO: perhaps refactor to requests when https://github.com/kennethreitz/requests/issues/68 lands?
|
||||||
|
from poster.encode import multipart_encode
|
||||||
|
POSTER_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
POSTER_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
__author__ = 'Joseph Kocherhans, Jacob Kaplan-Moss, Daniel Lindsley'
|
__author__ = 'Joseph Kocherhans, Jacob Kaplan-Moss, Daniel Lindsley'
|
||||||
__all__ = ['Solr']
|
__all__ = ['Solr']
|
||||||
__version__ = (2, 0, 14)
|
__version__ = (2, 1, 0, 'beta')
|
||||||
|
|
||||||
def get_version():
|
def get_version():
|
||||||
return "%s.%s.%s" % __version__[:3]
|
return "%s.%s.%s" % __version__[:3]
|
||||||
@ -200,6 +204,34 @@ if False:
|
|||||||
LOG.addHandler(stream)
|
LOG.addHandler(stream)
|
||||||
|
|
||||||
|
|
||||||
|
def unescape_html(text):
|
||||||
|
"""
|
||||||
|
Removes HTML or XML character references and entities from a text string.
|
||||||
|
|
||||||
|
@param text The HTML (or XML) source text.
|
||||||
|
@return The plain text, as a Unicode string, if necessary.
|
||||||
|
|
||||||
|
Source: http://effbot.org/zone/re-sub.htm#unescape-html
|
||||||
|
"""
|
||||||
|
def fixup(m):
|
||||||
|
text = m.group(0)
|
||||||
|
if text[:2] == "&#":
|
||||||
|
# character reference
|
||||||
|
try:
|
||||||
|
if text[:3] == "&#x":
|
||||||
|
return unichr(int(text[3:-1], 16))
|
||||||
|
else:
|
||||||
|
return unichr(int(text[2:-1]))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# named entity
|
||||||
|
try:
|
||||||
|
text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
return text # leave as is
|
||||||
|
return re.sub("&#?\w+;", fixup, text)
|
||||||
|
|
||||||
def safe_urlencode(params, doseq=0):
|
def safe_urlencode(params, doseq=0):
|
||||||
"""
|
"""
|
||||||
@ -231,13 +263,15 @@ class SolrError(Exception):
|
|||||||
|
|
||||||
|
|
||||||
class Results(object):
|
class Results(object):
|
||||||
def __init__(self, docs, hits, highlighting=None, facets=None, spellcheck=None, stats=None):
|
def __init__(self, docs, hits, highlighting=None, facets=None, spellcheck=None, stats=None, qtime=None, debug=None):
|
||||||
self.docs = docs
|
self.docs = docs
|
||||||
self.hits = hits
|
self.hits = hits
|
||||||
self.highlighting = highlighting or {}
|
self.highlighting = highlighting or {}
|
||||||
self.facets = facets or {}
|
self.facets = facets or {}
|
||||||
self.spellcheck = spellcheck or {}
|
self.spellcheck = spellcheck or {}
|
||||||
self.stats = stats or {}
|
self.stats = stats or {}
|
||||||
|
self.qtime = qtime
|
||||||
|
self.debug = debug or {}
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.docs)
|
return len(self.docs)
|
||||||
@ -379,7 +413,7 @@ class Solr(object):
|
|||||||
msg = "[Reason: %s]" % reason
|
msg = "[Reason: %s]" % reason
|
||||||
|
|
||||||
if reason is None:
|
if reason is None:
|
||||||
msg += "\n%s" % full_html
|
msg += "\n%s" % unescape_html(full_html)
|
||||||
|
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
@ -391,10 +425,10 @@ class Solr(object):
|
|||||||
server_type = None
|
server_type = None
|
||||||
server_string = headers.get('server', '')
|
server_string = headers.get('server', '')
|
||||||
|
|
||||||
if 'jetty' in server_string.lower():
|
if server_string and 'jetty' in server_string.lower():
|
||||||
server_type = 'jetty'
|
server_type = 'jetty'
|
||||||
|
|
||||||
if 'coyote' in server_string.lower():
|
if server_string and 'coyote' in server_string.lower():
|
||||||
# TODO: During the pysolr 3 effort, make this no longer a
|
# TODO: During the pysolr 3 effort, make this no longer a
|
||||||
# conditional and consider using ``lxml.html`` instead.
|
# conditional and consider using ``lxml.html`` instead.
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup
|
||||||
@ -529,6 +563,9 @@ class Solr(object):
|
|||||||
result = self.decoder.decode(response)
|
result = self.decoder.decode(response)
|
||||||
result_kwargs = {}
|
result_kwargs = {}
|
||||||
|
|
||||||
|
if result.get('debug'):
|
||||||
|
result_kwargs['debug'] = result['debug']
|
||||||
|
|
||||||
if result.get('highlighting'):
|
if result.get('highlighting'):
|
||||||
result_kwargs['highlighting'] = result['highlighting']
|
result_kwargs['highlighting'] = result['highlighting']
|
||||||
|
|
||||||
@ -541,6 +578,9 @@ class Solr(object):
|
|||||||
if result.get('stats'):
|
if result.get('stats'):
|
||||||
result_kwargs['stats'] = result['stats']
|
result_kwargs['stats'] = result['stats']
|
||||||
|
|
||||||
|
if 'QTime' in result.get('responseHeader', {}):
|
||||||
|
result_kwargs['qtime'] = result['responseHeader']['QTime']
|
||||||
|
|
||||||
self.log.debug("Found '%s' search results." % result['response']['numFound'])
|
self.log.debug("Found '%s' search results." % result['response']['numFound'])
|
||||||
return Results(result['response']['docs'], result['response']['numFound'], **result_kwargs)
|
return Results(result['response']['docs'], result['response']['numFound'], **result_kwargs)
|
||||||
|
|
||||||
@ -587,10 +627,15 @@ class Solr(object):
|
|||||||
terms = result.get("terms", {})
|
terms = result.get("terms", {})
|
||||||
res = {}
|
res = {}
|
||||||
|
|
||||||
while terms:
|
# in Solr 1.x the value of terms is a flat list:
|
||||||
# The raw values are a flat list: ["dance",23,"dancers",10,"dancing",8,"dancer",6]]
|
# ["field_name", ["dance",23,"dancers",10,"dancing",8,"dancer",6]]
|
||||||
field = terms.pop(0)
|
#
|
||||||
values = terms.pop(0)
|
# in Solr 3.x the value of terms is a dict:
|
||||||
|
# {"field_name": ["dance",23,"dancers",10,"dancing",8,"dancer",6]}
|
||||||
|
if isinstance(terms, types.ListType):
|
||||||
|
terms = dict(zip(terms[0::2], terms[1::2]))
|
||||||
|
|
||||||
|
for field, values in terms.iteritems():
|
||||||
tmp = list()
|
tmp = list()
|
||||||
|
|
||||||
while values:
|
while values:
|
||||||
@ -684,6 +729,80 @@ class Solr(object):
|
|||||||
msg = '<commit />'
|
msg = '<commit />'
|
||||||
response = self._update('<optimize />', waitFlush=waitFlush, waitSearcher=waitSearcher)
|
response = self._update('<optimize />', waitFlush=waitFlush, waitSearcher=waitSearcher)
|
||||||
|
|
||||||
|
def extract(self, file_obj, extractOnly=True):
|
||||||
|
"""
|
||||||
|
POSTs a file to the Solr ExtractingRequestHandler so rich content can
|
||||||
|
be processed using Apache Tika. See the Solr wiki for details:
|
||||||
|
|
||||||
|
http://wiki.apache.org/solr/ExtractingRequestHandler
|
||||||
|
|
||||||
|
The ExtractingRequestHandler has a very simply model: it extracts
|
||||||
|
contents and metadata from the uploaded file and inserts it directly
|
||||||
|
into the index. This is rarely useful as it allows no way to store
|
||||||
|
additional data or otherwise customize the record. Instead, by default
|
||||||
|
we'll use the extract-only mode to extract the data without indexing it
|
||||||
|
so the caller has the opportunity to process it as appropriate; call
|
||||||
|
with ``extractOnly=False`` if you want to insert with no additional
|
||||||
|
processing.
|
||||||
|
|
||||||
|
Returns None if metadata cannot be extracted; otherwise returns a
|
||||||
|
dictionary containing at least two keys:
|
||||||
|
|
||||||
|
:contents:
|
||||||
|
Extracted full-text content, if applicable
|
||||||
|
:metadata:
|
||||||
|
key:value pairs of text strings
|
||||||
|
"""
|
||||||
|
if not POSTER_AVAILABLE:
|
||||||
|
raise RuntimeError("Solr rich content extraction requires `poster` to be installed")
|
||||||
|
|
||||||
|
# The poster library unfortunately defaults to mime-type None when
|
||||||
|
# the file lacks a name and that causes it to send the file contents
|
||||||
|
# as a gigantic string rather than a separate MIME part, which breaks
|
||||||
|
# and spews the contents in the Solr request log:
|
||||||
|
if not hasattr(file_obj, "name"):
|
||||||
|
raise ValueError("extract() requires file-like objects which have a defined name property")
|
||||||
|
|
||||||
|
params = {
|
||||||
|
"extractOnly": "true" if extractOnly else "false",
|
||||||
|
"lowernames": "true",
|
||||||
|
"wt": "json",
|
||||||
|
# We'll provide the file using its true name as Tika may use that
|
||||||
|
# as a file type hint:
|
||||||
|
file_obj.name: file_obj,
|
||||||
|
}
|
||||||
|
|
||||||
|
body_generator, headers = multipart_encode(params)
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = self._send_request('POST', "%s/update/extract" % self.path,
|
||||||
|
"".join(body_generator), headers)
|
||||||
|
except (IOError, SolrError), e:
|
||||||
|
self.log.error("Failed to extract document metadata: %s", e,
|
||||||
|
exc_info=e)
|
||||||
|
raise
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(resp)
|
||||||
|
except ValueError, e:
|
||||||
|
self.log.error("Failed to load JSON response: %s", e,
|
||||||
|
exc_info=e)
|
||||||
|
raise
|
||||||
|
|
||||||
|
data['contents'] = data.pop(file_obj.name, None)
|
||||||
|
data['metadata'] = metadata = {}
|
||||||
|
|
||||||
|
raw_metadata = data.pop("%s_metadata" % file_obj.name, None)
|
||||||
|
|
||||||
|
if raw_metadata:
|
||||||
|
# The raw format is somewhat annoying: it's a flat list of
|
||||||
|
# alternating keys and value lists
|
||||||
|
while raw_metadata:
|
||||||
|
metadata[raw_metadata.pop()] = raw_metadata.pop()
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class SolrCoreAdmin(object):
|
class SolrCoreAdmin(object):
|
||||||
"""
|
"""
|
||||||
|
@ -33,13 +33,6 @@ from formencode import validators
|
|||||||
from classregistry import findClass
|
from classregistry import findClass
|
||||||
from itertools import count
|
from itertools import count
|
||||||
|
|
||||||
# Jython doesn't have the buffer sequence type (bug #1521).
|
|
||||||
# using this workaround instead.
|
|
||||||
try:
|
|
||||||
buffer
|
|
||||||
except NameError, e:
|
|
||||||
buffer = str
|
|
||||||
|
|
||||||
NoDefault = sqlbuilder.NoDefault
|
NoDefault = sqlbuilder.NoDefault
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
|
@ -1,13 +1,6 @@
|
|||||||
import sys
|
import sys
|
||||||
from array import array
|
from array import array
|
||||||
|
|
||||||
# Jython doesn't have the buffer sequence type (bug #1521).
|
|
||||||
# using this workaround instead.
|
|
||||||
try:
|
|
||||||
buffer
|
|
||||||
except NameError, e:
|
|
||||||
buffer = str
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import mx.DateTime.ISO
|
import mx.DateTime.ISO
|
||||||
origISOStr = mx.DateTime.ISO.strGMT
|
origISOStr = mx.DateTime.ISO.strGMT
|
||||||
|
@ -1037,4 +1037,3 @@ import postgres
|
|||||||
import rdbhost
|
import rdbhost
|
||||||
import sqlite
|
import sqlite
|
||||||
import sybase
|
import sybase
|
||||||
import jython_sqlite
|
|
||||||
|
@ -1,7 +0,0 @@
|
|||||||
from sqlobject.dbconnection import registerConnection
|
|
||||||
|
|
||||||
def builder():
|
|
||||||
import sqliteconnection
|
|
||||||
return sqliteconnection.SQLiteConnection
|
|
||||||
|
|
||||||
registerConnection(['jython_sqlite'], builder)
|
|
@ -1,356 +0,0 @@
|
|||||||
import base64
|
|
||||||
import os
|
|
||||||
import thread
|
|
||||||
import urllib
|
|
||||||
from sqlobject.dbconnection import DBAPI, Boolean
|
|
||||||
from sqlobject import col, sqlbuilder
|
|
||||||
from sqlobject.dberrors import *
|
|
||||||
|
|
||||||
sqlite2_Binary = None
|
|
||||||
|
|
||||||
class ErrorMessage(str):
|
|
||||||
def __new__(cls, e):
|
|
||||||
obj = str.__new__(cls, e[0])
|
|
||||||
obj.code = None
|
|
||||||
obj.module = e.__module__
|
|
||||||
obj.exception = e.__class__.__name__
|
|
||||||
return obj
|
|
||||||
|
|
||||||
class SQLiteConnection(DBAPI):
|
|
||||||
|
|
||||||
supportTransactions = True
|
|
||||||
dbName = 'sqlite'
|
|
||||||
schemes = [dbName]
|
|
||||||
|
|
||||||
def __init__(self, filename, autoCommit=1, **kw):
|
|
||||||
from com.ziclix.python.sql import zxJDBC as sqlite
|
|
||||||
self.module = sqlite
|
|
||||||
#self.using_sqlite2 = False
|
|
||||||
self.filename = filename # full path to sqlite-db-file
|
|
||||||
self._memory = filename == ':memory:'
|
|
||||||
#if self._memory and not self.using_sqlite2:
|
|
||||||
# raise ValueError("You must use sqlite2 to use in-memory databases")
|
|
||||||
opts = { }
|
|
||||||
opts['autocommit'] = autoCommit
|
|
||||||
|
|
||||||
# use only one connection for sqlite - supports multiple)
|
|
||||||
# cursors per connection
|
|
||||||
self._connOptions = opts
|
|
||||||
self.use_table_info = Boolean(kw.pop("use_table_info", True))
|
|
||||||
DBAPI.__init__(self, **kw)
|
|
||||||
self._threadPool = {}
|
|
||||||
self._threadOrigination = {}
|
|
||||||
if self._memory:
|
|
||||||
self._memoryConn = self.module.connect('jdbc:sqlite::memory:', None, None, 'org.sqlite.JDBC')
|
|
||||||
self._memoryConn.autocommit = autoCommit
|
|
||||||
# Convert text data from SQLite to str, not unicode -
|
|
||||||
# SQLObject converts it to unicode itself.
|
|
||||||
#self._memoryConn.text_factory = str
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _connectionFromParams(cls, user, password, host, port, path, args):
|
|
||||||
if host == ':memory:':
|
|
||||||
host = None
|
|
||||||
|
|
||||||
assert host is None and port is None, (
|
|
||||||
"SQLite can only be used locally (with a URI like "
|
|
||||||
"sqlite:/file or sqlite:///file, not sqlite://%s%s)" %
|
|
||||||
(host, port and ':%r' % port or ''))
|
|
||||||
assert user is None and password is None, (
|
|
||||||
"You may not provide usernames or passwords for SQLite "
|
|
||||||
"databases")
|
|
||||||
if path == "/:memory:":
|
|
||||||
path = ":memory:"
|
|
||||||
return cls(filename=path, **args)
|
|
||||||
|
|
||||||
def oldUri(self):
|
|
||||||
path = self.filename
|
|
||||||
if path == ":memory:":
|
|
||||||
path = "/:memory:"
|
|
||||||
else:
|
|
||||||
path = "//" + path
|
|
||||||
return 'sqlite:%s' % path
|
|
||||||
|
|
||||||
def uri(self):
|
|
||||||
path = self.filename
|
|
||||||
if path == ":memory:":
|
|
||||||
path = "/:memory:"
|
|
||||||
else:
|
|
||||||
if path.startswith('/'):
|
|
||||||
path = "//" + path
|
|
||||||
else:
|
|
||||||
path = "///" + path
|
|
||||||
path = urllib.quote(path)
|
|
||||||
return 'sqlite:%s' % path
|
|
||||||
|
|
||||||
def getConnection(self):
|
|
||||||
# SQLite can't share connections between threads, and so can't
|
|
||||||
# pool connections. Since we are isolating threads here, we
|
|
||||||
# don't have to worry about locking as much.
|
|
||||||
if self._memory:
|
|
||||||
conn = self.makeConnection()
|
|
||||||
self._connectionNumbers[id(conn)] = self._connectionCount
|
|
||||||
self._connectionCount += 1
|
|
||||||
return conn
|
|
||||||
threadid = thread.get_ident()
|
|
||||||
if (self._pool is not None
|
|
||||||
and threadid in self._threadPool):
|
|
||||||
conn = self._threadPool[threadid]
|
|
||||||
del self._threadPool[threadid]
|
|
||||||
if conn in self._pool:
|
|
||||||
self._pool.remove(conn)
|
|
||||||
else:
|
|
||||||
conn = self.makeConnection()
|
|
||||||
if self._pool is not None:
|
|
||||||
self._threadOrigination[id(conn)] = threadid
|
|
||||||
self._connectionNumbers[id(conn)] = self._connectionCount
|
|
||||||
self._connectionCount += 1
|
|
||||||
if self.debug:
|
|
||||||
s = 'ACQUIRE'
|
|
||||||
if self._pool is not None:
|
|
||||||
s += ' pool=[%s]' % ', '.join([str(self._connectionNumbers[id(v)]) for v in self._pool])
|
|
||||||
self.printDebug(conn, s, 'Pool')
|
|
||||||
return conn
|
|
||||||
|
|
||||||
def releaseConnection(self, conn, explicit=False):
|
|
||||||
if self._memory:
|
|
||||||
return
|
|
||||||
threadid = self._threadOrigination.get(id(conn))
|
|
||||||
DBAPI.releaseConnection(self, conn, explicit=explicit)
|
|
||||||
if (self._pool is not None and threadid
|
|
||||||
and threadid not in self._threadPool):
|
|
||||||
self._threadPool[threadid] = conn
|
|
||||||
else:
|
|
||||||
if self._pool and conn in self._pool:
|
|
||||||
self._pool.remove(conn)
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
def _setAutoCommit(self, conn, auto):
|
|
||||||
conn.autocommit = auto
|
|
||||||
|
|
||||||
def _setIsolationLevel(self, conn, level):
|
|
||||||
# apparently not applicable for sqlite2 drivers
|
|
||||||
return
|
|
||||||
|
|
||||||
def makeConnection(self):
|
|
||||||
if self._memory:
|
|
||||||
return self._memoryConn
|
|
||||||
|
|
||||||
conn = self.module.connect('jdbc:sqlite:%s' % self.filename, '', '', 'org.sqlite.JDBC')
|
|
||||||
conn.autocommit = self._connOptions.get('autocommit', 1)
|
|
||||||
# TODO: zxjdbc.connect does not have a text_factory property
|
|
||||||
#conn.text_factory = str # Convert text data to str, not unicode
|
|
||||||
return conn
|
|
||||||
|
|
||||||
def _executeRetry(self, conn, cursor, query):
|
|
||||||
if self.debug:
|
|
||||||
self.printDebug(conn, query, 'QueryR')
|
|
||||||
try:
|
|
||||||
return cursor.execute(query)
|
|
||||||
except self.module.OperationalError, e:
|
|
||||||
raise OperationalError(ErrorMessage(e))
|
|
||||||
except self.module.IntegrityError, e:
|
|
||||||
msg = ErrorMessage(e)
|
|
||||||
if msg.startswith('column') and msg.endswith('not unique'):
|
|
||||||
raise DuplicateEntryError(msg)
|
|
||||||
else:
|
|
||||||
raise IntegrityError(msg)
|
|
||||||
except self.module.InternalError, e:
|
|
||||||
raise InternalError(ErrorMessage(e))
|
|
||||||
except self.module.ProgrammingError, e:
|
|
||||||
raise ProgrammingError(ErrorMessage(e))
|
|
||||||
except self.module.DataError, e:
|
|
||||||
raise DataError(ErrorMessage(e))
|
|
||||||
except self.module.NotSupportedError, e:
|
|
||||||
raise NotSupportedError(ErrorMessage(e))
|
|
||||||
except self.module.DatabaseError, e:
|
|
||||||
raise DatabaseError(ErrorMessage(e))
|
|
||||||
except self.module.InterfaceError, e:
|
|
||||||
raise InterfaceError(ErrorMessage(e))
|
|
||||||
except self.module.Warning, e:
|
|
||||||
raise Warning(ErrorMessage(e))
|
|
||||||
except self.module.Error, e:
|
|
||||||
raise Error(ErrorMessage(e))
|
|
||||||
|
|
||||||
def _queryInsertID(self, conn, soInstance, id, names, values):
|
|
||||||
table = soInstance.sqlmeta.table
|
|
||||||
idName = soInstance.sqlmeta.idName
|
|
||||||
c = conn.cursor()
|
|
||||||
if id is not None:
|
|
||||||
names = [idName] + names
|
|
||||||
values = [id] + values
|
|
||||||
q = self._insertSQL(table, names, values)
|
|
||||||
if self.debug:
|
|
||||||
self.printDebug(conn, q, 'QueryIns')
|
|
||||||
self._executeRetry(conn, c, q)
|
|
||||||
# lastrowid is a DB-API extension from "PEP 0249":
|
|
||||||
if id is None:
|
|
||||||
if c.lastrowid:
|
|
||||||
id = c.lastrowid
|
|
||||||
else:
|
|
||||||
# the Java SQLite JDBC driver doesn't seem to have implemented
|
|
||||||
# the lastrowid extension, so we have to do this manually.
|
|
||||||
# Also getMetaData().getGeneratedKeys() is inaccessible.
|
|
||||||
# TODO: make this a prepared statement?
|
|
||||||
self._executeRetry(conn, c, 'select last_insert_rowid()')
|
|
||||||
id = c.fetchone()[0]
|
|
||||||
if self.debugOutput:
|
|
||||||
self.printDebug(conn, id, 'QueryIns', 'result')
|
|
||||||
return id
|
|
||||||
|
|
||||||
def _insertSQL(self, table, names, values):
|
|
||||||
if not names:
|
|
||||||
assert not values
|
|
||||||
# INSERT INTO table () VALUES () isn't allowed in
|
|
||||||
# SQLite (though it is in other databases)
|
|
||||||
return ("INSERT INTO %s VALUES (NULL)" % table)
|
|
||||||
else:
|
|
||||||
return DBAPI._insertSQL(self, table, names, values)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _queryAddLimitOffset(cls, query, start, end):
|
|
||||||
if not start:
|
|
||||||
return "%s LIMIT %i" % (query, end)
|
|
||||||
if not end:
|
|
||||||
return "%s LIMIT 0 OFFSET %i" % (query, start)
|
|
||||||
return "%s LIMIT %i OFFSET %i" % (query, end-start, start)
|
|
||||||
|
|
||||||
def createColumn(self, soClass, col):
|
|
||||||
return col.sqliteCreateSQL()
|
|
||||||
|
|
||||||
def createReferenceConstraint(self, soClass, col):
|
|
||||||
return None
|
|
||||||
|
|
||||||
def createIDColumn(self, soClass):
|
|
||||||
return self._createIDColumn(soClass.sqlmeta)
|
|
||||||
|
|
||||||
def _createIDColumn(self, sqlmeta):
|
|
||||||
if sqlmeta.idType == str:
|
|
||||||
return '%s TEXT PRIMARY KEY' % sqlmeta.idName
|
|
||||||
return '%s INTEGER PRIMARY KEY AUTOINCREMENT' % sqlmeta.idName
|
|
||||||
|
|
||||||
def joinSQLType(self, join):
|
|
||||||
return 'INT NOT NULL'
|
|
||||||
|
|
||||||
def tableExists(self, tableName):
|
|
||||||
result = self.queryOne("SELECT tbl_name FROM sqlite_master WHERE type='table' AND tbl_name = '%s'" % tableName)
|
|
||||||
# turn it into a boolean:
|
|
||||||
return not not result
|
|
||||||
|
|
||||||
def createIndexSQL(self, soClass, index):
|
|
||||||
return index.sqliteCreateIndexSQL(soClass)
|
|
||||||
|
|
||||||
def addColumn(self, tableName, column):
|
|
||||||
self.query('ALTER TABLE %s ADD COLUMN %s' %
|
|
||||||
(tableName,
|
|
||||||
column.sqliteCreateSQL()))
|
|
||||||
self.query('VACUUM %s' % tableName)
|
|
||||||
|
|
||||||
def delColumn(self, sqlmeta, column):
|
|
||||||
self.recreateTableWithoutColumn(sqlmeta, column)
|
|
||||||
|
|
||||||
def recreateTableWithoutColumn(self, sqlmeta, column):
|
|
||||||
new_name = sqlmeta.table + '_ORIGINAL'
|
|
||||||
self.query('ALTER TABLE %s RENAME TO %s' % (sqlmeta.table, new_name))
|
|
||||||
cols = [self._createIDColumn(sqlmeta)] \
|
|
||||||
+ [self.createColumn(None, col)
|
|
||||||
for col in sqlmeta.columnList if col.name != column.name]
|
|
||||||
cols = ",\n".join([" %s" % c for c in cols])
|
|
||||||
self.query('CREATE TABLE %s (\n%s\n)' % (sqlmeta.table, cols))
|
|
||||||
all_columns = ', '.join([sqlmeta.idName] + [col.dbName for col in sqlmeta.columnList])
|
|
||||||
self.query('INSERT INTO %s (%s) SELECT %s FROM %s' % (
|
|
||||||
sqlmeta.table, all_columns, all_columns, new_name))
|
|
||||||
self.query('DROP TABLE %s' % new_name)
|
|
||||||
|
|
||||||
def columnsFromSchema(self, tableName, soClass):
|
|
||||||
if self.use_table_info:
|
|
||||||
return self._columnsFromSchemaTableInfo(tableName, soClass)
|
|
||||||
else:
|
|
||||||
return self._columnsFromSchemaParse(tableName, soClass)
|
|
||||||
|
|
||||||
def _columnsFromSchemaTableInfo(self, tableName, soClass):
|
|
||||||
colData = self.queryAll("PRAGMA table_info(%s)" % tableName)
|
|
||||||
results = []
|
|
||||||
for index, field, t, nullAllowed, default, key in colData:
|
|
||||||
if field == soClass.sqlmeta.idName:
|
|
||||||
continue
|
|
||||||
colClass, kw = self.guessClass(t)
|
|
||||||
if default == 'NULL':
|
|
||||||
nullAllowed = True
|
|
||||||
default = None
|
|
||||||
kw['name'] = soClass.sqlmeta.style.dbColumnToPythonAttr(field)
|
|
||||||
kw['dbName'] = field
|
|
||||||
kw['notNone'] = not nullAllowed
|
|
||||||
kw['default'] = default
|
|
||||||
# @@ skip key...
|
|
||||||
# @@ skip extra...
|
|
||||||
results.append(colClass(**kw))
|
|
||||||
return results
|
|
||||||
|
|
||||||
def _columnsFromSchemaParse(self, tableName, soClass):
|
|
||||||
colData = self.queryOne("SELECT sql FROM sqlite_master WHERE type='table' AND name='%s'"
|
|
||||||
% tableName)
|
|
||||||
if not colData:
|
|
||||||
raise ValueError('The table %s was not found in the database. Load failed.' % tableName)
|
|
||||||
colData = colData[0].split('(', 1)[1].strip()[:-2]
|
|
||||||
while True:
|
|
||||||
start = colData.find('(')
|
|
||||||
if start == -1: break
|
|
||||||
end = colData.find(')', start)
|
|
||||||
if end == -1: break
|
|
||||||
colData = colData[:start] + colData[end+1:]
|
|
||||||
results = []
|
|
||||||
for colDesc in colData.split(','):
|
|
||||||
parts = colDesc.strip().split(' ', 2)
|
|
||||||
field = parts[0].strip()
|
|
||||||
# skip comments
|
|
||||||
if field.startswith('--'):
|
|
||||||
continue
|
|
||||||
# get rid of enclosing quotes
|
|
||||||
if field[0] == field[-1] == '"':
|
|
||||||
field = field[1:-1]
|
|
||||||
if field == getattr(soClass.sqlmeta, 'idName', 'id'):
|
|
||||||
continue
|
|
||||||
colClass, kw = self.guessClass(parts[1].strip())
|
|
||||||
if len(parts) == 2:
|
|
||||||
index_info = ''
|
|
||||||
else:
|
|
||||||
index_info = parts[2].strip().upper()
|
|
||||||
kw['name'] = soClass.sqlmeta.style.dbColumnToPythonAttr(field)
|
|
||||||
kw['dbName'] = field
|
|
||||||
import re
|
|
||||||
nullble = re.search(r'(\b\S*)\sNULL', index_info)
|
|
||||||
default = re.search(r"DEFAULT\s((?:\d[\dA-FX.]*)|(?:'[^']*')|(?:#[^#]*#))", index_info)
|
|
||||||
kw['notNone'] = nullble and nullble.group(1) == 'NOT'
|
|
||||||
kw['default'] = default and default.group(1)
|
|
||||||
# @@ skip key...
|
|
||||||
# @@ skip extra...
|
|
||||||
results.append(colClass(**kw))
|
|
||||||
return results
|
|
||||||
|
|
||||||
def guessClass(self, t):
|
|
||||||
t = t.upper()
|
|
||||||
if t.find('INT') >= 0:
|
|
||||||
return col.IntCol, {}
|
|
||||||
elif t.find('TEXT') >= 0 or t.find('CHAR') >= 0 or t.find('CLOB') >= 0:
|
|
||||||
return col.StringCol, {'length': 2**32-1}
|
|
||||||
elif t.find('BLOB') >= 0:
|
|
||||||
return col.BLOBCol, {"length": 2**32-1}
|
|
||||||
elif t.find('REAL') >= 0 or t.find('FLOAT') >= 0:
|
|
||||||
return col.FloatCol, {}
|
|
||||||
elif t.find('DECIMAL') >= 0:
|
|
||||||
return col.DecimalCol, {'size': None, 'precision': None}
|
|
||||||
elif t.find('BOOL') >= 0:
|
|
||||||
return col.BoolCol, {}
|
|
||||||
else:
|
|
||||||
return col.Col, {}
|
|
||||||
|
|
||||||
def createEmptyDatabase(self):
|
|
||||||
if self._memory:
|
|
||||||
return
|
|
||||||
open(self.filename, 'w').close()
|
|
||||||
|
|
||||||
def dropDatabase(self):
|
|
||||||
if self._memory:
|
|
||||||
return
|
|
||||||
os.unlink(self.filename)
|
|
Loading…
x
Reference in New Issue
Block a user