1
0
mirror of https://github.com/djohnlewis/stackdump synced 2025-12-06 07:53:28 +00:00

Initial commit. Still building up the env and some parsing code.

This commit is contained in:
Samuel Lai
2011-09-11 14:29:39 +10:00
commit af2eafeccd
301 changed files with 82327 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env python
# This script reads a directory containing the Stack Exchange data dump and
# returns a list of sites.
import sys
import os
import re
# Matches the archive names found in the dump's Content directory. A site is
# shipped either as a single archive ("<site>.7z") or as a multi-part archive
# ("<site>.7z.001", "<site>.7z.002", ...). Only one of the two capture groups
# takes part in any given match, and that group holds the site name.
CONTENT_FILENAME_RE = re.compile(r'^(.+)\.7z$|^(.+)\.7z\.\d{3}$')

if len(sys.argv) != 2:
    print('One argument is expected - the path to the data dump directory.')
    sys.exit(1)

dump_path = sys.argv[1]
print('Using the data dump path: ' + dump_path + '\n')

if not os.path.exists(dump_path):
    print('The given data dump path does not exist.')
    sys.exit(1)

# we expect it to contain a 'Content' directory
dump_path = os.path.join(dump_path, 'Content')
# isdir rather than exists: os.listdir() below needs a directory, so a plain
# file named 'Content' is reported as an invalid dump instead of crashing.
if not os.path.isdir(dump_path):
    print('The given data dump path is invalid. The Content subdirectory was expected.')
    sys.exit(1)

filenames = os.listdir(dump_path)

# a set, so the parts of a multi-part archive collapse into a single site
sites = set()
for f in filenames:
    match = CONTENT_FILENAME_RE.match(f)
    if match:
        # lastindex is the number of the capture group that matched, i.e. the
        # one holding the site name for whichever alternative applied
        sites.add(match.group(match.lastindex))

# Function-call form, like every other print in this script. With a single
# argument the output is the same as the statement form on Python 2, and the
# line is also valid on Python 3.
print(sites)

94
python/dataproc/insert.py Normal file
View File

@@ -0,0 +1,94 @@
#!/usr/bin/env python
# This script takes the path of a set of extracted Stack Exchange XML files and
# imports the badges found in badges.xml into a temporary SQLite database.
import sys
import os
import xml.sax
from datetime import datetime
from sqlobject import *
# MODELS
class Badge(SQLObject):
    # One record per <row> element of badges.xml. The attribute names match
    # the keys that BadgeContentHandler collects for each row.

    # integer taken from the row's UserId attribute
    userId = IntCol()
    # text taken from the row's Name attribute
    name = StringCol()
    # timestamp parsed from the row's Date attribute
    date = DateTimeCol()
# SAX HANDLERS
# Layout of the date-and-time part of a dump timestamp. The fractional seconds
# that follow it (".000") are handled separately by _parse_iso_date().
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'


def _parse_iso_date(value):
    """
    Converts a dump timestamp such as "2010-05-19T21:57:31.000" to a datetime.

    The fractional seconds are optional. They are parsed by hand instead of
    with the '%f' directive so that this also works on interpreters whose
    strptime does not know '%f'.

    Raises an exception if the value is not a string in the expected layout
    (ValueError for malformed text, AttributeError for None).
    """
    whole, _, fraction = value.partition('.')
    parsed = datetime.strptime(whole, ISO_DATE_FORMAT)

    if fraction:
        if not fraction.isdigit():
            raise ValueError('invalid fractional seconds in ' + repr(value))
        # right-pad (or cut) to six digits so the fraction reads as microseconds
        parsed = parsed.replace(microsecond=int((fraction + '000000')[:6]))

    return parsed


class BadgeContentHandler(xml.sax.ContentHandler):
    """
    SAX handler that turns each 'row' element into a Badge record.

    Parses the string -

    <row Id="15" UserId="6" Name="Supporter" Date="2010-05-19T21:57:31.000" />

    The attributes are read when the element starts and the record is created
    when the element ends. Rows that cannot be parsed or inserted are reported
    on stdout and skipped; they never abort the parse.
    """
    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        # properties of the row currently being processed, or None if there is
        # no row in progress or the current row could not be parsed
        self.cur_props = None

    def startElement(self, name, attrs):
        """
        Collects the badge properties from the attributes of a 'row' element.

        On success cur_props holds the keys 'userId', 'name' and 'date'; on
        failure it is None, so that endElement skips the row.
        """
        if name != 'row':
            return

        # only publish the properties once every field has parsed
        self.cur_props = None
        try:
            props = {
                'userId': int(attrs.get('UserId', 0)),
                'name': attrs.get('Name', ''),
                'date': _parse_iso_date(attrs.get('Date')),
            }
        except Exception:
            # could not parse this, so ignore the row completely.
            # sys.exc_info() is used to get at the exception because it works
            # with both the old and the new except-clause syntax.
            e = sys.exc_info()[1]
            print('[badge] Exception: ' + str(e))
            print('[badge] Could not parse the row ' + repr(attrs))
            return

        self.cur_props = props

    def endElement(self, name):
        """
        Saves the properties collected for the 'row' element that just ended.
        """
        if name != 'row':
            return

        if not self.cur_props:
            return

        # the cur_props is now complete. Save it.
        try:
            # the object is automatically saved to the database on creation
            Badge(**self.cur_props)
        except Exception:
            # could not insert this, so ignore the row
            e = sys.exc_info()[1]
            print('[badge] Exception: ' + str(e))
            import traceback
            traceback.print_exc()
            print('[badge] Could not insert the row ' + repr(self.cur_props))

        self.cur_props = None
# MAIN METHOD
# the only argument is the directory holding the extracted XML files
if len(sys.argv) != 2:
    print('One argument is expected - the path to the extracted XML files.')
    sys.exit(1)

xml_root = sys.argv[1]
print('Using the XML root path: ' + xml_root + '\n')

if not os.path.exists(xml_root):
    print('The given XML root path does not exist.')
    sys.exit(1)

# always start from an empty database: remove what a previous run left behind
temp_db_path = '/tmp/stackdump_import_temp.sqlite'
if os.path.exists(temp_db_path):
    os.remove(temp_db_path)

# create the temp database and make it the connection used by the models
sqlhub.processConnection = connectionForURI('jython_sqlite://' + temp_db_path)

# BADGES
print('[badge] PARSING BADGES...')

print('[badge] creating badge table...')
Badge.createTable()

xml_path = os.path.join(xml_root, 'badges.xml')

# the handler creates one Badge per row while the file is being parsed
print('[badge] start parsing badges.xml...')
xml.sax.parse(xml_path, BadgeContentHandler())

print('[badge] FINISHED PARSING BADGES.\n')