mirror of
https://github.com/djohnlewis/stackdump
synced 2025-04-07 10:13:27 +00:00
Renamed dataproc management commands to better names.
This commit is contained in:
parent
f075580a2e
commit
adccd41724
python/src/stackdump/dataproc
@ -1,47 +0,0 @@
|
|||||||
#!/usr/bin/env python

##
# This script deletes the site specified by the ID in the first parameter.
# It removes the site's rows from the sqlite database (related rows cascade)
# and its documents from the Solr search index.
##

import os
import sys

from sqlobject import sqlhub, connectionForURI, AND, OR
from pysolr import Solr

from stackdump.models import Site

script_dir = os.path.dirname(sys.argv[0])

if len(sys.argv) < 2:
    # fixed: was a Python 2 print statement; the rest of the script uses print()
    print('The site ID needs to be specified as the first parameter.')
    sys.exit(1)

# connect to the data sources
db_path = os.path.abspath(os.path.join(script_dir, '../../../../data/stackdump.sqlite'))

# connect to the database
print('Connecting to the database...')
conn_str = 'sqlite://' + db_path
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')

# connect to solr
print('Connecting to solr...')
solr = Solr("http://localhost:8983/solr/")
print('Connected.\n')

site_id = int(sys.argv[1])
site = Site.select(Site.q.id==site_id).getOne(None)
if not site:
    # fixed: was a Python 2 print statement; the rest of the script uses print()
    print('Site ID %d does not exist.' % site_id)
    sys.exit(1)

# keep the name around — the Site row is gone after the delete below
site_name = site.name
print('Deleting site "%s" from the database... ' % site.name)
sys.stdout.flush()
Site.delete(site.id) # the relationship cascades, so other rows will be deleted
print('Deleted.\n')

print('Deleting site "%s" from solr... ' % site_name)
solr.delete(q='siteName:"%s"' % site_name)
print('Deleted.\n')
|
|
@ -6,13 +6,16 @@ from __future__ import with_statement
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import time
|
||||||
import xml.sax
|
import xml.sax
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
|
|
||||||
from sqlobject import sqlhub, connectionForURI, AND, OR
|
from sqlobject import sqlhub, connectionForURI, AND, OR, IN, SQLObject
|
||||||
|
from sqlobject.sqlbuilder import Delete, Insert
|
||||||
|
from sqlobject.styles import DefaultStyle
|
||||||
from pysolr import Solr
|
from pysolr import Solr
|
||||||
|
|
||||||
from stackdump.models import Site, Badge, Comment, User
|
from stackdump.models import Site, Badge, Comment, User
|
||||||
@ -38,6 +41,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
|
|||||||
self.obj_class = obj_class
|
self.obj_class = obj_class
|
||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
self.row_count = 0
|
self.row_count = 0
|
||||||
|
self.db_style = DefaultStyle()
|
||||||
|
|
||||||
def endElement(self, name):
|
def endElement(self, name):
|
||||||
if name != 'row':
|
if name != 'row':
|
||||||
@ -53,7 +57,21 @@ class BaseContentHandler(xml.sax.ContentHandler):
|
|||||||
# the cur_props is now complete. Save it.
|
# the cur_props is now complete. Save it.
|
||||||
try:
|
try:
|
||||||
# the object is automatically saved to the database on creation
|
# the object is automatically saved to the database on creation
|
||||||
self.obj_class(**self.cur_props)
|
# adding records using the SQLObject object takes too long
|
||||||
|
#self.obj_class(**self.cur_props)
|
||||||
|
|
||||||
|
# so we're going to go closer to the metal
|
||||||
|
props_for_db = { }
|
||||||
|
for k,v in self.cur_props.items():
|
||||||
|
# if this is a reference to a FK, massage the values to fit
|
||||||
|
if isinstance(v, SQLObject):
|
||||||
|
k += 'Id'
|
||||||
|
v = v.id
|
||||||
|
# need to convert the attr names to DB column names
|
||||||
|
props_for_db[self.db_style.pythonAttrToDBColumn(k)] = v
|
||||||
|
|
||||||
|
conn.query(conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
|
||||||
|
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
# could not insert this, so ignore the row
|
# could not insert this, so ignore the row
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
@ -64,7 +82,9 @@ class BaseContentHandler(xml.sax.ContentHandler):
|
|||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
|
|
||||||
if self.row_count % 1000 == 0:
|
if self.row_count % 1000 == 0:
|
||||||
print('[badge]\t\tProcessed %d rows.' % (self.row_count))
|
print('%-10s Processed %d rows.' % ('[%s]' % self.obj_class.sqlmeta.table,
|
||||||
|
self.row_count)
|
||||||
|
)
|
||||||
|
|
||||||
class BadgeContentHandler(BaseContentHandler):
|
class BadgeContentHandler(BaseContentHandler):
|
||||||
"""
|
"""
|
||||||
@ -206,6 +226,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
def __init__(self, site):
|
def __init__(self, site):
|
||||||
self.site = site
|
self.site = site
|
||||||
self.unfinished_questions = { }
|
self.unfinished_questions = { }
|
||||||
|
self.orphan_answers = { }
|
||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
self.row_count = 0
|
self.row_count = 0
|
||||||
|
|
||||||
@ -233,7 +254,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
d['answerCount'] = int(attrs.get('AnswerCount', 0))
|
d['answerCount'] = int(attrs.get('AnswerCount', 0))
|
||||||
d['viewCount'] = int(attrs.get('ViewCount', 0))
|
d['viewCount'] = int(attrs.get('ViewCount', 0))
|
||||||
else:
|
else:
|
||||||
raise ValueError('Unknown PostTypeId [%s] for row ID [%s]' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
|
raise ValueError('Unknown PostTypeId [%s] for row ID [%s]. Probably a tag wiki page.' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
|
||||||
|
|
||||||
if 'AcceptedAnswerId' in attrs:
|
if 'AcceptedAnswerId' in attrs:
|
||||||
d['acceptedAnswerId'] = int(attrs.get('AcceptedAnswerId', 0))
|
d['acceptedAnswerId'] = int(attrs.get('AcceptedAnswerId', 0))
|
||||||
@ -281,24 +302,13 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
try:
|
try:
|
||||||
d = self.cur_props
|
d = self.cur_props
|
||||||
|
|
||||||
# find, convert to JSON and attach any comments for this question
|
|
||||||
comments = Comment.select(AND(Comment.q.site == self.site,
|
|
||||||
Comment.q.postId == int(d['id'])))
|
|
||||||
for comment in comments:
|
|
||||||
c = { }
|
|
||||||
for f in Comment.json_fields:
|
|
||||||
c[f] = getattr(comment, f)
|
|
||||||
d['comments'].append(c)
|
|
||||||
|
|
||||||
if len(d['comments']) != d['commentCount']:
|
|
||||||
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (d['id'], d['commentCount'], len(d['comments'])))
|
|
||||||
|
|
||||||
# the cur_props is now complete. Stash it away until question is complete.
|
# the cur_props is now complete. Stash it away until question is complete.
|
||||||
if d.has_key('parentId'):
|
if d.has_key('parentId'):
|
||||||
# this is an answer.
|
# this is an answer.
|
||||||
if not self.unfinished_questions.has_key(d['parentId']):
|
if not self.unfinished_questions.has_key(d['parentId']):
|
||||||
print('lookup keys: ' + repr(self.unfinished_questions.keys()))
|
if not self.orphan_answers.has_key(d['parentId']):
|
||||||
raise ValueError("This answer's [ID# %s] question [ID# %s] has not been processed yet. Incorrect order in XML? Ignoring answer." % (d['id'], d['parentId']))
|
self.orphan_answers[d['parentId']] = [ ]
|
||||||
|
self.orphan_answers[d['parentId']].append(d)
|
||||||
else:
|
else:
|
||||||
self.unfinished_questions[d['parentId']]['answers'].append(d)
|
self.unfinished_questions[d['parentId']]['answers'].append(d)
|
||||||
else:
|
else:
|
||||||
@ -309,6 +319,11 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
(d['id'], d['title'], self.unfinished_questions[d['id']]['title']))
|
(d['id'], d['title'], self.unfinished_questions[d['id']]['title']))
|
||||||
else:
|
else:
|
||||||
self.unfinished_questions[d['id']] = d
|
self.unfinished_questions[d['id']] = d
|
||||||
|
# check if any of the orphan answers are for this question
|
||||||
|
if self.orphan_answers.has_key(d['id']):
|
||||||
|
d['answers'].extend(self.orphan_answers[d['id']])
|
||||||
|
# remove orphan answers from the orphan list
|
||||||
|
del self.orphan_answers[d['id']]
|
||||||
|
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
# could not insert this, so ignore the row
|
# could not insert this, so ignore the row
|
||||||
@ -320,9 +335,11 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
|
|
||||||
if self.row_count % 1000 == 0:
|
if self.row_count % 1000 == 0:
|
||||||
print('\tProcessed %d rows.' % (self.row_count))
|
print('%-10s Processed %d rows.' % ('[post]', self.row_count))
|
||||||
|
|
||||||
self.commit_finished_questions()
|
# only check for finished questions every 200 rows to speed things up
|
||||||
|
if self.row_count % 200 == 0:
|
||||||
|
self.commit_finished_questions()
|
||||||
|
|
||||||
def commit_finished_questions(self):
|
def commit_finished_questions(self):
|
||||||
# check if any questions are now complete (answerCount=len(answers))
|
# check if any questions are now complete (answerCount=len(answers))
|
||||||
@ -356,6 +373,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
the search index in the form that we want.
|
the search index in the form that we want.
|
||||||
|
|
||||||
Things this does -
|
Things this does -
|
||||||
|
* fetch comments for question and answers and attach them to the objects
|
||||||
* creates the 'text' field for the search index that contains all the
|
* creates the 'text' field for the search index that contains all the
|
||||||
text of the question (title, question, answers and all comments).
|
text of the question (title, question, answers and all comments).
|
||||||
* serialises answers to JSON
|
* serialises answers to JSON
|
||||||
@ -365,6 +383,46 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
* add question JSON to document
|
* add question JSON to document
|
||||||
* commit document to search index.
|
* commit document to search index.
|
||||||
"""
|
"""
|
||||||
|
# find and attach any comments for this question and its answers
|
||||||
|
# get the set of post ids
|
||||||
|
post_ids = set()
|
||||||
|
post_ids.add(q['id'])
|
||||||
|
for a in q['answers']:
|
||||||
|
post_ids.add(a['id'])
|
||||||
|
|
||||||
|
# get the comments
|
||||||
|
comment_objs = Comment.select(AND(Comment.q.site == self.site,
|
||||||
|
IN(Comment.q.postId, list(post_ids))))
|
||||||
|
|
||||||
|
# sort the comments out into a dict keyed on the post id
|
||||||
|
comments = { }
|
||||||
|
for c in comment_objs:
|
||||||
|
# convert comment object to a JSON-serialisable object
|
||||||
|
comment_json = { }
|
||||||
|
for f in Comment.json_fields:
|
||||||
|
comment_json[f] = getattr(c, f)
|
||||||
|
|
||||||
|
# we already know that this comment comes from the current site, so
|
||||||
|
# we only need to filter on post ID
|
||||||
|
if not comments.has_key(c.postId):
|
||||||
|
comments[c.postId] = [ ]
|
||||||
|
comments[c.postId].append(comment_json)
|
||||||
|
|
||||||
|
# add comments to the question
|
||||||
|
if comments.has_key(q['id']):
|
||||||
|
q['comments'].extend(comments[q['id']])
|
||||||
|
|
||||||
|
if len(q['comments']) != q['commentCount']:
|
||||||
|
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (q['id'], q['commentCount'], len(q['comments'])))
|
||||||
|
|
||||||
|
# add comments to the answers
|
||||||
|
for a in q['answers']:
|
||||||
|
if comments.has_key(a['id']):
|
||||||
|
a['comments'].extend(comments[a['id']])
|
||||||
|
|
||||||
|
if len(a['comments']) != a['commentCount']:
|
||||||
|
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (a['id'], a['commentCount'], len(a['comments'])))
|
||||||
|
|
||||||
doc = { }
|
doc = { }
|
||||||
|
|
||||||
# create the text field contents
|
# create the text field contents
|
||||||
@ -442,6 +500,8 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
|
|
||||||
Should be called after all XML has been parsed.
|
Should be called after all XML has been parsed.
|
||||||
"""
|
"""
|
||||||
|
self.commit_finished_questions()
|
||||||
|
|
||||||
for id,q in self.unfinished_questions.items():
|
for id,q in self.unfinished_questions.items():
|
||||||
print('Question [ID# %d] was expected to have %d answers, but got %d instead. Ignoring inconsistency.' % (q['id'], q['answerCount'], len(q['answers'])))
|
print('Question [ID# %d] was expected to have %d answers, but got %d instead. Ignoring inconsistency.' % (q['id'], q['answerCount'], len(q['answers'])))
|
||||||
|
|
||||||
@ -458,6 +518,10 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
|
|
||||||
# we're committing all questions, so nothing is now unfinished
|
# we're committing all questions, so nothing is now unfinished
|
||||||
self.unfinished_questions.clear()
|
self.unfinished_questions.clear()
|
||||||
|
|
||||||
|
# check if there are any orphan answers
|
||||||
|
for question_id, answers in self.orphan_answers.items():
|
||||||
|
print('There are %d answers for missing question [ID# %d]. Ignoring orphan answers.' % (len(answers), question_id))
|
||||||
|
|
||||||
# MAIN METHOD
|
# MAIN METHOD
|
||||||
parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
|
parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
|
||||||
@ -469,7 +533,7 @@ parser.add_option('-c', '--dump-date', help='Dump date of the site.')
|
|||||||
(cmd_options, cmd_args) = parser.parse_args()
|
(cmd_options, cmd_args) = parser.parse_args()
|
||||||
|
|
||||||
if len(cmd_args) < 1:
|
if len(cmd_args) < 1:
|
||||||
print('The path to the extracted XML files is required.')
|
print('The path to the directory containing the extracted XML files is required.')
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
xml_root = cmd_args[0]
|
xml_root = cmd_args[0]
|
||||||
@ -557,65 +621,92 @@ if not (site_name and site_key and site_desc and dump_date):
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# check if site is already in database; if so, purge the data.
|
# check if site is already in database; if so, purge the data.
|
||||||
sites = Site.select(Site.q.name==site_name)
|
site = list(Site.select(Site.q.key==site_key))
|
||||||
# the site really shouldn't exist more than once, but just in case
|
if len(site) > 0:
|
||||||
for site in sites:
|
site = site[0]
|
||||||
print('Deleting site "%s" from the database... ' % site.name)
|
print('Deleting site "%s" from the database... ' % site.name)
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
Site.delete(site.id) # the relationship cascades, so other rows will be deleted
|
# Using SQLObject to delete rows takes too long, so we're going to do it directly
|
||||||
|
#Site.delete(site.id) # the relationship cascades, so other rows will be deleted
|
||||||
|
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
||||||
|
conn = sqlhub.threadConnection
|
||||||
|
# these deletions are done in this order to avoid FK constraint issues
|
||||||
|
print('\tDeleting comments...')
|
||||||
|
conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site==site))))
|
||||||
|
print('\tDeleting badges...')
|
||||||
|
conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))
|
||||||
|
print('\tDeleting users...')
|
||||||
|
conn.query(conn.sqlrepr(Delete(User.sqlmeta.table, where=(User.q.site==site))))
|
||||||
|
print('\tDeleting site...')
|
||||||
|
conn.query(conn.sqlrepr(Delete(Site.sqlmeta.table, where=(Site.q.id==site.id))))
|
||||||
|
sqlhub.threadConnection.commit(close=True)
|
||||||
print('Deleted.\n')
|
print('Deleted.\n')
|
||||||
|
|
||||||
print('Deleting site "%s" from the solr... ' % site.name)
|
print('Deleting site "%s" from the solr... ' % site.name)
|
||||||
solr.delete(q='siteKey:"%s"' % site.key)
|
solr.delete(q='siteKey:"%s"' % site.key)
|
||||||
print('Deleted.\n')
|
print('Deleted.\n')
|
||||||
|
|
||||||
|
timing_start = time.time()
|
||||||
|
|
||||||
|
# start a new transaction
|
||||||
|
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
||||||
|
conn = sqlhub.threadConnection
|
||||||
|
|
||||||
# create a new Site
|
# create a new Site
|
||||||
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date, import_date=datetime.now())
|
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date, import_date=datetime.now())
|
||||||
|
|
||||||
# BADGES
|
# BADGES
|
||||||
print('[badge] PARSING BADGES...')
|
# Processing of badges has been disabled because they don't offer any useful
|
||||||
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
# information in the offline situation.
|
||||||
xml_path = os.path.join(xml_root, 'badges.xml')
|
#print('[badge] PARSING BADGES...')
|
||||||
print('[badge] start parsing badges.xml...')
|
#xml_path = os.path.join(xml_root, 'badges.xml')
|
||||||
handler = BadgeContentHandler(site)
|
#print('[badge] start parsing badges.xml...')
|
||||||
xml.sax.parse(xml_path, handler)
|
#handler = BadgeContentHandler(site)
|
||||||
sqlhub.threadConnection.commit(close=True)
|
#xml.sax.parse(xml_path, handler)
|
||||||
print('[badge]\t\tProcessed %d rows.' % (handler.row_count))
|
#print('[badge]\tProcessed %d rows.' % (handler.row_count))
|
||||||
print('[badge] FINISHED PARSING BADGES.\n')
|
#print('[badge] FINISHED PARSING BADGES.\n')
|
||||||
|
|
||||||
# COMMENTS
|
# COMMENTS
|
||||||
|
# comments are temporarily stored in the database for retrieval when parsing
|
||||||
|
# posts only.
|
||||||
print('[comment] PARSING COMMENTS...')
|
print('[comment] PARSING COMMENTS...')
|
||||||
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
|
||||||
xml_path = os.path.join(xml_root, 'comments.xml')
|
xml_path = os.path.join(xml_root, 'comments.xml')
|
||||||
print('[comment] start parsing comments.xml...')
|
print('[comment] start parsing comments.xml...')
|
||||||
handler = CommentContentHandler(site)
|
handler = CommentContentHandler(site)
|
||||||
xml.sax.parse(xml_path, handler)
|
xml.sax.parse(xml_path, handler)
|
||||||
sqlhub.threadConnection.commit(close=True)
|
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
|
||||||
print('[comment]\tProcessed %d rows.' % (handler.row_count))
|
|
||||||
print('[comment] FINISHED PARSING COMMENTS.\n')
|
print('[comment] FINISHED PARSING COMMENTS.\n')
|
||||||
|
|
||||||
# USERS
|
# USERS
|
||||||
print('[user] PARSING USERS...')
|
print('[user] PARSING USERS...')
|
||||||
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
|
||||||
xml_path = os.path.join(xml_root, 'users.xml')
|
xml_path = os.path.join(xml_root, 'users.xml')
|
||||||
print('[user] start parsing users.xml...')
|
print('[user] start parsing users.xml...')
|
||||||
handler = UserContentHandler(site)
|
handler = UserContentHandler(site)
|
||||||
xml.sax.parse(xml_path, handler)
|
xml.sax.parse(xml_path, handler)
|
||||||
sqlhub.threadConnection.commit(close=True)
|
print('%-10s Processed %d rows.' % ('[user]', handler.row_count))
|
||||||
print('[user]\t\tProcessed %d rows.' % (handler.row_count))
|
|
||||||
print('[user] FINISHED PARSING USERS.\n')
|
print('[user] FINISHED PARSING USERS.\n')
|
||||||
|
|
||||||
# POSTS
|
# POSTS
|
||||||
|
# posts are added directly to the Solr index; they are not added to the database.
|
||||||
print('[post] PARSING POSTS...')
|
print('[post] PARSING POSTS...')
|
||||||
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
|
||||||
xml_path = os.path.join(xml_root, 'posts.xml')
|
xml_path = os.path.join(xml_root, 'posts.xml')
|
||||||
print('[post] start parsing posts.xml...')
|
print('[post] start parsing posts.xml...')
|
||||||
handler = PostContentHandler(site)
|
handler = PostContentHandler(site)
|
||||||
xml.sax.parse(xml_path, handler)
|
xml.sax.parse(xml_path, handler)
|
||||||
handler.commit_all_questions()
|
handler.commit_all_questions()
|
||||||
sqlhub.threadConnection.commit(close=True)
|
print('%-10s Processed %d rows.' % ('[post]', handler.row_count))
|
||||||
print('[post]\tProcessed %d rows.' % (handler.row_count))
|
|
||||||
|
|
||||||
print('[post] FINISHED PARSING POSTS.\n')
|
print('[post] FINISHED PARSING POSTS.\n')
|
||||||
|
|
||||||
# TODO: delete comments?
|
# DELETE COMMENTS
|
||||||
|
print('[comment] DELETING COMMENTS FROM DATABASE (they are no longer needed)...')
|
||||||
|
conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site == site))))
|
||||||
|
print('[comment] FINISHED DELETING COMMENTS.\n')
|
||||||
|
|
||||||
|
# commit transaction
|
||||||
|
sqlhub.threadConnection.commit(close=True)
|
||||||
|
|
||||||
|
timing_end = time.time()
|
||||||
|
|
||||||
|
print('Time taken for site insertion into Stackdump: %f seconds.' % (timing_end - timing_start))
|
||||||
|
print('')
|
85
python/src/stackdump/dataproc/manage_sites.py
Normal file
85
python/src/stackdump/dataproc/manage_sites.py
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
##
|
||||||
|
# This script provides the ability to manage sites in Stackdump.
|
||||||
|
##
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from optparse import OptionParser
|
||||||
|
|
||||||
|
from sqlobject import sqlhub, connectionForURI
|
||||||
|
from pysolr import Solr
|
||||||
|
|
||||||
|
from stackdump.models import Site
|
||||||
|
|
||||||
|
script_dir = os.path.dirname(sys.argv[0])
|
||||||
|
|
||||||
|
# FUNCTIONS
|
||||||
|
def list_sites():
    """Print the key and name of every site imported into Stackdump."""
    # locate the sqlite database relative to this script
    database_file = os.path.abspath(
        os.path.join(script_dir, '../../../../data/stackdump.sqlite'))

    # establish the database connection
    print('Connecting to the database...')
    sqlhub.processConnection = connectionForURI('sqlite://' + database_file)
    print('Connected.\n')

    all_sites = list(Site.select()) # force the lazy method to execute

    if not all_sites:
        return

    # one line per site: its key in brackets, then its display name
    print('[site key] site name')
    print('-' * 80)
    for s in all_sites:
        print('[%s] %s' % (s.key, s.name))
|
||||||
|
|
||||||
|
def delete_site(site_key):
    """
    Delete the site identified by site_key from Stackdump.

    Removes the site's rows from the sqlite database (inside a transaction;
    related rows cascade) and its documents from the Solr index. Exits the
    process with status 1 if the key does not match any imported site.
    """
    # connect to the data sources
    db_path = os.path.abspath(os.path.join(script_dir, '../../../../data/stackdump.sqlite'))

    # connect to the database
    print('Connecting to the database...')
    conn_str = 'sqlite://' + db_path
    sqlhub.processConnection = connectionForURI(conn_str)
    print('Connected.\n')

    # connect to solr
    print('Connecting to solr...')
    solr = Solr("http://localhost:8983/solr/")
    print('Connected.\n')

    site = Site.select(Site.q.key==site_key).getOne(None)
    if not site:
        # fixed: was a Python 2 print statement; the rest of this function uses print()
        print('Site key %s does not exist.' % site_key)
        sys.exit(1)

    # run the deletion inside a transaction so a failure leaves the DB intact
    sqlhub.threadConnection = sqlhub.processConnection.transaction()

    print('Deleting site "%s" from the database... ' % site.name)
    sys.stdout.flush()
    Site.delete(site.id) # the relationship cascades, so other rows will be deleted
    print('Deleted.\n')

    print('Deleting site "%s" from solr... ' % site.name)
    solr.delete(q='siteKey:"%s"' % site_key)
    print('Deleted.\n')

    sqlhub.threadConnection.commit(close=True)
|
||||||
|
|
||||||
|
# END FUNCTIONS

# MAIN METHOD
if __name__ == '__main__':
    # command-line interface: list sites, or delete one by its key
    parser = OptionParser()
    parser.add_option('-l', '--list-sites',
                      help='List sites imported into Stackdump.',
                      action="store_true")
    parser.add_option('-d', '--delete-site',
                      help='Delete a site from Stackdump.',
                      metavar='SITE_KEY')

    (cmd_options, cmd_args) = parser.parse_args()

    # dispatch to the requested action; show usage when none was given
    if cmd_options.list_sites:
        list_sites()
    elif cmd_options.delete_site:
        delete_site(cmd_options.delete_site)
    else:
        parser.print_help()
|
Loading…
x
Reference in New Issue
Block a user