mirror of
https://github.com/djohnlewis/stackdump
synced 2025-12-06 07:53:28 +00:00
Initial commit. Still building up the env and some parsing code.
This commit is contained in:
38
python/dataproc/get_sites.py
Normal file
38
python/dataproc/get_sites.py
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# This script reads a directory containing the Stack Exchange data dump and
|
||||
# returns a list of sites.
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
|
||||
CONTENT_FILENAME_RE = re.compile(r'^(.+)\.7z$|^(.+)\.7z\.\d{3}$')


def find_sites(filenames):
    """
    Returns the set of site names found in the given archive file names.

    A file name counts as a site archive when it ends in '.7z' or in a
    multi-part suffix such as '.7z.001'. The site name is everything before
    that suffix. File names that match neither form are ignored.
    """
    sites = set()

    for f in filenames:
        match = CONTENT_FILENAME_RE.match(f)
        if match:
            # only one of the two alternatives can match, so lastindex is the
            # number of the group that captured the site name.
            sites.add(match.group(match.lastindex))

    return sites


def main(argv):
    """
    Prints the sites found in the data dump directory named by argv[1].

    argv is the full argument list, including the program name. Returns the
    exit status for the process: 0 on success, 1 when the arguments or the
    given path are not usable.
    """
    if len(argv) != 2:
        print('One argument is expected - the path to the data dump directory.')
        return 1

    dump_path = argv[1]
    print('Using the data dump path: ' + dump_path + '\n')

    if not os.path.exists(dump_path):
        print('The given data dump path does not exist.')
        return 1

    # we expect it to contain a 'Content' directory
    dump_path = os.path.join(dump_path, 'Content')
    if not os.path.exists(dump_path):
        print('The given data dump path is invalid. The Content subdirectory was expected.')
        return 1

    # print() with a single argument gives the same output as the print
    # statement, and keeps this consistent with the other print calls.
    print(find_sites(os.listdir(dump_path)))
    return 0


# Only run when executed as a script, so that importing this module has no
# side effects.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
||||
|
||||
94
python/dataproc/insert.py
Normal file
94
python/dataproc/insert.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# This script takes the path of a set of extracted Stack Exchange XML files and
|
||||
# inserts their contents into a temporary SQLite database.
|
||||
|
||||
import sys
|
||||
import os
|
||||
import xml.sax
|
||||
from datetime import datetime
|
||||
|
||||
from sqlobject import *
|
||||
|
||||
# MODELS
|
||||
class Badge(SQLObject):
    """
    Database model for a single badge row read from badges.xml.
    """
    # taken from the row's UserId attribute
    userId = IntCol()
    # taken from the row's Name attribute
    name = StringCol()
    # taken from the row's Date attribute, parsed into a datetime
    date = DateTimeCol()
|
||||
|
||||
# SAX HANDLERS
|
||||
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'


def _parse_iso_date(value):
    """
    Parses a timestamp such as '2010-05-19T21:57:31.000' into a datetime.

    The fractional seconds are optional. They are split off and applied
    separately because ISO_DATE_FORMAT has no field for them, and strptime
    rejects any text the format does not consume.

    Raises an exception (ValueError, or AttributeError for a non-string) if
    the value cannot be parsed.
    """
    whole, _, fraction = value.partition('.')
    parsed = datetime.strptime(whole, ISO_DATE_FORMAT)
    if fraction:
        # normalise to exactly six digits, i.e. microseconds
        parsed = parsed.replace(microsecond=int(fraction.ljust(6, '0')[:6]))
    return parsed


class BadgeContentHandler(xml.sax.ContentHandler):
    """
    Parses the string -

    <row Id="15" UserId="6" Name="Supporter" Date="2010-05-19T21:57:31.000" />

    and saves each successfully parsed row as a Badge. Rows that cannot be
    parsed or inserted are reported on stdout and skipped.
    """
    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        # properties of the row currently being processed, or None if there
        # is no row in progress or it could not be parsed
        self.cur_props = None

    def startElement(self, name, attrs):
        """
        Collects the properties of a 'row' element into self.cur_props.
        Other elements are ignored.
        """
        if name != 'row':
            return

        try:
            d = self.cur_props = { }
            d['userId'] = int(attrs.get('UserId', 0))
            d['name'] = attrs.get('Name', '')
            d['date'] = _parse_iso_date(attrs.get('Date'))
        except Exception:
            # sys.exc_info() is used instead of binding the exception in the
            # except clause so the syntax is valid on old and new interpreters.
            e = sys.exc_info()[1]
            # could not parse this, so ignore the row completely
            self.cur_props = None
            print('[badge] Exception: ' + str(e))
            print('[badge] Could not parse the row ' + repr(attrs))

    def endElement(self, name):
        """
        Saves the properties collected for a 'row' element as a Badge, then
        clears them ready for the next row.
        """
        if name != 'row':
            return

        if not self.cur_props:
            return

        # the cur_props is now complete. Save it.
        try:
            # the object is automatically saved to the database on creation
            Badge(**self.cur_props)
        except Exception:
            e = sys.exc_info()[1]
            # could not insert this, so ignore the row
            print('[badge] Exception: ' + str(e))
            import traceback
            traceback.print_exc()
            print('[badge] Could not insert the row ' + repr(self.cur_props))

        self.cur_props = None
|
||||
|
||||
|
||||
# MAIN METHOD
|
||||
def main(argv):
    """
    Imports the badges found under the XML root path named by argv[1] into a
    freshly created temporary database.

    argv is the full argument list, including the program name. Returns the
    exit status for the process: 0 on success, 1 when the arguments or the
    given path are not usable. Errors raised while creating the table or
    parsing the XML are not caught here.
    """
    if len(argv) != 2:
        print('One argument is expected - the path to the extracted XML files.')
        return 1

    xml_root = argv[1]
    print('Using the XML root path: ' + xml_root + '\n')

    if not os.path.exists(xml_root):
        print('The given XML root path does not exist.')
        return 1

    # any database left over from a previous run is discarded
    temp_db_path = '/tmp/stackdump_import_temp.sqlite'
    if os.path.exists(temp_db_path):
        os.remove(temp_db_path)

    # create the temp database
    sqlhub.processConnection = connectionForURI('jython_sqlite://' + temp_db_path)

    # BADGES
    print('[badge] PARSING BADGES...')
    print('[badge] creating badge table...')
    Badge.createTable()
    xml_path = os.path.join(xml_root, 'badges.xml')
    print('[badge] start parsing badges.xml...')
    xml.sax.parse(xml_path, BadgeContentHandler())
    print('[badge] FINISHED PARSING BADGES.\n')
    return 0


# Only run when executed as a script, so that importing this module (e.g. to
# reuse the models or handlers) does not delete the temp database or exit.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
||||
|
||||
Reference in New Issue
Block a user