
Renamed the dataproc management commands to clearer names.

Samuel Lai 2012-02-12 13:55:18 +11:00
parent f075580a2e
commit adccd41724
3 changed files with 220 additions and 91 deletions

View File

@@ -1,47 +0,0 @@
#!/usr/bin/env python
# This script deletes the site specified by the ID in the first parameter.
import os
import sys
from sqlobject import sqlhub, connectionForURI, AND, OR
from pysolr import Solr
from stackdump.models import Site
script_dir = os.path.dirname(sys.argv[0])
if len(sys.argv) < 2:
print 'The site ID needs to be specified as the first parameter.'
sys.exit(1)
# connect to the data sources
db_path = os.path.abspath(os.path.join(script_dir, '../../../../data/stackdump.sqlite'))
# connect to the database
print('Connecting to the database...')
conn_str = 'sqlite://' + db_path
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')
# connect to solr
print('Connecting to solr...')
solr = Solr("http://localhost:8983/solr/")
print('Connected.\n')
site_id = int(sys.argv[1])
site = Site.select(Site.q.id==site_id).getOne(None)
if not site:
print 'Site ID %d does not exist.' % site_id
sys.exit(1)
site_name = site.name
print('Deleting site "%s" from the database... ' % site.name)
sys.stdout.flush()
Site.delete(site.id) # the relationship cascades, so other rows will be deleted
print('Deleted.\n')
print('Deleting site "%s" from solr... ' % site_name)
solr.delete(q='siteName:"%s"' % site_name)
print('Deleted.\n')
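The delete above relies on SQLObject cascading the relationship from Site to its child rows. For reference, a minimal sketch of how such a cascade is declared on a SQLObject model; these column definitions are illustrative, not the actual stackdump schema:

from sqlobject import SQLObject, ForeignKey, UnicodeCol

class Site(SQLObject):
    name = UnicodeCol()

class Comment(SQLObject):
    # cascade=True deletes this row when its referenced Site row is deleted
    site = ForeignKey('Site', cascade=True)
    text = UnicodeCol()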

View File

@@ -6,13 +6,16 @@ from __future__ import with_statement
import sys
import os
import time
import xml.sax
from datetime import datetime
import re
from optparse import OptionParser
from xml.etree import ElementTree
from sqlobject import sqlhub, connectionForURI, AND, OR
from sqlobject import sqlhub, connectionForURI, AND, OR, IN, SQLObject
from sqlobject.sqlbuilder import Delete, Insert
from sqlobject.styles import DefaultStyle
from pysolr import Solr
from stackdump.models import Site, Badge, Comment, User
@@ -38,6 +41,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
self.obj_class = obj_class
self.cur_props = None
self.row_count = 0
self.db_style = DefaultStyle()
def endElement(self, name):
if name != 'row':
@@ -53,7 +57,21 @@ class BaseContentHandler(xml.sax.ContentHandler):
# the cur_props is now complete. Save it.
try:
# the object is automatically saved to the database on creation
self.obj_class(**self.cur_props)
# adding records using the SQLObject object takes too long
#self.obj_class(**self.cur_props)
# so we're going to go closer to the metal
props_for_db = { }
for k,v in self.cur_props.items():
# if this is a reference to a FK, massage the values to fit
if isinstance(v, SQLObject):
k += 'Id'
v = v.id
# need to convert the attr names to DB column names
props_for_db[self.db_style.pythonAttrToDBColumn(k)] = v
conn.query(conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
except Exception, e:
# could not insert this, so ignore the row
print('Exception: ' + str(e))
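The handler above stops instantiating an SQLObject object per row because that proved too slow, and instead builds raw INSERT statements with sqlobject.sqlbuilder. A self-contained sketch of that pattern; the badge table schema here is illustrative, not the real one:

from sqlobject import sqlhub, connectionForURI
from sqlobject.sqlbuilder import Insert
from sqlobject.styles import DefaultStyle

sqlhub.processConnection = connectionForURI('sqlite:/:memory:')
conn = sqlhub.processConnection
conn.query('CREATE TABLE badge (id INTEGER PRIMARY KEY, site_id INT, name TEXT)')

style = DefaultStyle()
row = {'siteId': 1, 'name': 'Autobiographer'}  # Python attribute names
# convert attribute names to DB column names, as the handler does
values = dict((style.pythonAttrToDBColumn(k), v) for k, v in row.items())
conn.query(conn.sqlrepr(Insert('badge', values=values)))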
@@ -64,7 +82,9 @@ class BaseContentHandler(xml.sax.ContentHandler):
self.cur_props = None
if self.row_count % 1000 == 0:
print('[badge]\t\tProcessed %d rows.' % (self.row_count))
print('%-10s Processed %d rows.' % ('[%s]' % self.obj_class.sqlmeta.table,
self.row_count)
)
class BadgeContentHandler(BaseContentHandler):
"""
@@ -206,6 +226,7 @@ class PostContentHandler(xml.sax.ContentHandler):
def __init__(self, site):
self.site = site
self.unfinished_questions = { }
self.orphan_answers = { }
self.cur_props = None
self.row_count = 0
@@ -233,7 +254,7 @@ class PostContentHandler(xml.sax.ContentHandler):
d['answerCount'] = int(attrs.get('AnswerCount', 0))
d['viewCount'] = int(attrs.get('ViewCount', 0))
else:
raise ValueError('Unknown PostTypeId [%s] for row ID [%s]' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
raise ValueError('Unknown PostTypeId [%s] for row ID [%s]. Probably a tag wiki page.' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
if 'AcceptedAnswerId' in attrs:
d['acceptedAnswerId'] = int(attrs.get('AcceptedAnswerId', 0))
@@ -281,24 +302,13 @@ class PostContentHandler(xml.sax.ContentHandler):
try:
d = self.cur_props
# find, convert to JSON and attach any comments for this question
comments = Comment.select(AND(Comment.q.site == self.site,
Comment.q.postId == int(d['id'])))
for comment in comments:
c = { }
for f in Comment.json_fields:
c[f] = getattr(comment, f)
d['comments'].append(c)
if len(d['comments']) != d['commentCount']:
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (d['id'], d['commentCount'], len(d['comments'])))
# the cur_props is now complete. Stash it away until question is complete.
if d.has_key('parentId'):
# this is an answer.
if not self.unfinished_questions.has_key(d['parentId']):
print('lookup keys: ' + repr(self.unfinished_questions.keys()))
raise ValueError("This answer's [ID# %s] question [ID# %s] has not been processed yet. Incorrect order in XML? Ignoring answer." % (d['id'], d['parentId']))
if not self.orphan_answers.has_key(d['parentId']):
self.orphan_answers[d['parentId']] = [ ]
self.orphan_answers[d['parentId']].append(d)
else:
self.unfinished_questions[d['parentId']]['answers'].append(d)
else:
@@ -309,6 +319,11 @@ class PostContentHandler(xml.sax.ContentHandler):
(d['id'], d['title'], self.unfinished_questions[d['id']]['title']))
else:
self.unfinished_questions[d['id']] = d
# check if any of the orphan answers are for this question
if self.orphan_answers.has_key(d['id']):
d['answers'].extend(self.orphan_answers[d['id']])
# remove orphan answers from the orphan list
del self.orphan_answers[d['id']]
except Exception, e:
# could not insert this, so ignore the row
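The new orphan_answers dict buffers answers that appear in the XML before their parent question, where previously a ValueError was raised. A condensed sketch of the buffering, using hypothetical post dicts:

posts = [
    {'id': 11, 'parentId': 10},   # an answer, arriving first
    {'id': 10, 'answers': [ ]},   # its question, arriving later
]
unfinished_questions = { }
orphan_answers = { }
for d in posts:
    if 'parentId' in d:
        if d['parentId'] not in unfinished_questions:
            # question not seen yet, so park the answer
            orphan_answers.setdefault(d['parentId'], [ ]).append(d)
        else:
            unfinished_questions[d['parentId']]['answers'].append(d)
    else:
        unfinished_questions[d['id']] = d
        # adopt any answers that arrived before this question
        d['answers'].extend(orphan_answers.pop(d['id'], [ ]))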
@@ -320,9 +335,11 @@ class PostContentHandler(xml.sax.ContentHandler):
self.cur_props = None
if self.row_count % 1000 == 0:
print('\tProcessed %d rows.' % (self.row_count))
print('%-10s Processed %d rows.' % ('[post]', self.row_count))
self.commit_finished_questions()
# only check for finished questions every 200 rows to speed things up
if self.row_count % 200 == 0:
self.commit_finished_questions()
def commit_finished_questions(self):
# check if any questions are now complete (answerCount=len(answers))
@@ -356,6 +373,7 @@ class PostContentHandler(xml.sax.ContentHandler):
the search index in the form that we want.
Things this does -
* fetch comments for question and answers and attach them to the objects
* creates the 'text' field for the search index that contains all the
text of the question (title, question, answers and all comments).
* serialises answers to JSON
@@ -365,6 +383,46 @@ class PostContentHandler(xml.sax.ContentHandler):
* add question JSON to document
* commit document to search index.
"""
# find and attach any comments for this question and its answers
# get the set of post ids
post_ids = set()
post_ids.add(q['id'])
for a in q['answers']:
post_ids.add(a['id'])
# get the comments
comment_objs = Comment.select(AND(Comment.q.site == self.site,
IN(Comment.q.postId, list(post_ids))))
# sort the comments out into a dict keyed on the post id
comments = { }
for c in comment_objs:
# convert comment object to a JSON-serialisable object
comment_json = { }
for f in Comment.json_fields:
comment_json[f] = getattr(c, f)
# we already know that this comment comes from the current site, so
# we only need to filter on post ID
if not comments.has_key(c.postId):
comments[c.postId] = [ ]
comments[c.postId].append(comment_json)
# add comments to the question
if comments.has_key(q['id']):
q['comments'].extend(comments[q['id']])
if len(q['comments']) != q['commentCount']:
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (q['id'], q['commentCount'], len(q['comments'])))
# add comments to the answers
for a in q['answers']:
if comments.has_key(a['id']):
a['comments'].extend(comments[a['id']])
if len(a['comments']) != a['commentCount']:
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (a['id'], a['commentCount'], len(a['comments'])))
doc = { }
# create the text field contents
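The block above fetches the comments for a question and all of its answers in a single IN query, then buckets them by post ID before attaching. The bucketing idiom in isolation, with hypothetical (post_id, text) rows:

rows = [(10, 'nice question'), (11, 'try X instead'), (10, 'see also Y')]
comments = { }
for post_id, text in rows:
    comments.setdefault(post_id, [ ]).append({'text': text})
# comments == {10: [{'text': 'nice question'}, {'text': 'see also Y'}],
#              11: [{'text': 'try X instead'}]}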
@@ -442,6 +500,8 @@ class PostContentHandler(xml.sax.ContentHandler):
Should be called after all XML has been parsed.
"""
self.commit_finished_questions()
for id,q in self.unfinished_questions.items():
print('Question [ID# %d] was expected to have %d answers, but got %d instead. Ignoring inconsistency.' % (q['id'], q['answerCount'], len(q['answers'])))
@@ -458,6 +518,10 @@ class PostContentHandler(xml.sax.ContentHandler):
# we're committing all questions, so nothing is now unfinished
self.unfinished_questions.clear()
# check if there are any orphan answers
for question_id, answers in self.orphan_answers.items():
print('There are %d answers for missing question [ID# %d]. Ignoring orphan answers.' % (len(answers), question_id))
# MAIN METHOD
parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
@@ -469,7 +533,7 @@ parser.add_option('-c', '--dump-date', help='Dump date of the site.')
(cmd_options, cmd_args) = parser.parse_args()
if len(cmd_args) < 1:
print('The path to the extracted XML files is required.')
print('The path to the directory containing the extracted XML files is required.')
sys.exit(1)
xml_root = cmd_args[0]
@@ -557,65 +621,92 @@ if not (site_name and site_key and site_desc and dump_date):
sys.exit(1)
# check if site is already in database; if so, purge the data.
sites = Site.select(Site.q.name==site_name)
# the site really shouldn't exist more than once, but just in case
for site in sites:
site = list(Site.select(Site.q.key==site_key))
if len(site) > 0:
site = site[0]
print('Deleting site "%s" from the database... ' % site.name)
sys.stdout.flush()
Site.delete(site.id) # the relationship cascades, so other rows will be deleted
# Using SQLObject to delete rows takes too long, so we're going to do it directly
#Site.delete(site.id) # the relationship cascades, so other rows will be deleted
sqlhub.threadConnection = sqlhub.processConnection.transaction()
conn = sqlhub.threadConnection
# these deletions are done in this order to avoid FK constraint issues
print('\tDeleting comments...')
conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site==site))))
print('\tDeleting badges...')
conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))
print('\tDeleting users...')
conn.query(conn.sqlrepr(Delete(User.sqlmeta.table, where=(User.q.site==site))))
print('\tDeleting site...')
conn.query(conn.sqlrepr(Delete(Site.sqlmeta.table, where=(Site.q.id==site.id))))
sqlhub.threadConnection.commit(close=True)
print('Deleted.\n')
print('Deleting site "%s" from the solr... ' % site.name)
solr.delete(q='siteKey:"%s"' % site.key)
print('Deleted.\n')
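The purge above deletes the child tables before the parent Site row so no foreign-key constraint is violated mid-purge. The same ordering written as an equivalent loop, assuming the same models:

from sqlobject.sqlbuilder import Delete
conn = sqlhub.threadConnection
# children first, parent last, to satisfy FK constraints
for model in (Comment, Badge, User):
    conn.query(conn.sqlrepr(Delete(model.sqlmeta.table, where=(model.q.site == site))))
conn.query(conn.sqlrepr(Delete(Site.sqlmeta.table, where=(Site.q.id == site.id))))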
timing_start = time.time()
# start a new transaction
sqlhub.threadConnection = sqlhub.processConnection.transaction()
conn = sqlhub.threadConnection
# create a new Site
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date, import_date=datetime.now())
# BADGES
print('[badge] PARSING BADGES...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'badges.xml')
print('[badge] start parsing badges.xml...')
handler = BadgeContentHandler(site)
xml.sax.parse(xml_path, handler)
sqlhub.threadConnection.commit(close=True)
print('[badge]\t\tProcessed %d rows.' % (handler.row_count))
print('[badge] FINISHED PARSING BADGES.\n')
# Processing of badges has been disabled because they don't offer any useful
# information in the offline situation.
#print('[badge] PARSING BADGES...')
#xml_path = os.path.join(xml_root, 'badges.xml')
#print('[badge] start parsing badges.xml...')
#handler = BadgeContentHandler(site)
#xml.sax.parse(xml_path, handler)
#print('[badge]\tProcessed %d rows.' % (handler.row_count))
#print('[badge] FINISHED PARSING BADGES.\n')
# COMMENTS
# comments are temporarily stored in the database for retrieval when parsing
# posts only.
print('[comment] PARSING COMMENTS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'comments.xml')
print('[comment] start parsing comments.xml...')
handler = CommentContentHandler(site)
xml.sax.parse(xml_path, handler)
sqlhub.threadConnection.commit(close=True)
print('[comment]\tProcessed %d rows.' % (handler.row_count))
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
print('[comment] FINISHED PARSING COMMENTS.\n')
# USERS
print('[user] PARSING USERS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'users.xml')
print('[user] start parsing users.xml...')
handler = UserContentHandler(site)
xml.sax.parse(xml_path, handler)
sqlhub.threadConnection.commit(close=True)
print('[user]\t\tProcessed %d rows.' % (handler.row_count))
print('%-10s Processed %d rows.' % ('[user]', handler.row_count))
print('[user] FINISHED PARSING USERS.\n')
# POSTS
# posts are added directly to the Solr index; they are not added to the database.
print('[post] PARSING POSTS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'posts.xml')
print('[post] start parsing posts.xml...')
handler = PostContentHandler(site)
xml.sax.parse(xml_path, handler)
handler.commit_all_questions()
sqlhub.threadConnection.commit(close=True)
print('[post]\tProcessed %d rows.' % (handler.row_count))
print('%-10s Processed %d rows.' % ('[post]', handler.row_count))
print('[post] FINISHED PARSING POSTS.\n')
# TODO: delete comments?
# DELETE COMMENTS
print('[comment] DELETING COMMENTS FROM DATABASE (they are no longer needed)...')
conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site == site))))
print('[comment] FINISHED DELETING COMMENTS.\n')
# commit transaction
sqlhub.threadConnection.commit(close=True)
timing_end = time.time()
print('Time taken for site insertion into Stackdump: %f seconds.' % (timing_end - timing_start))
print('')
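Each parsing phase above now runs inside its own sqlhub transaction, committed when the phase finishes. A sketch of that per-phase pattern as a helper; run_phase is hypothetical, not part of the codebase:

def run_phase(parse_fn):
    # hypothetical helper showing the transaction-per-phase pattern
    sqlhub.threadConnection = sqlhub.processConnection.transaction()
    try:
        parse_fn()
        sqlhub.threadConnection.commit(close=True)
    except:
        sqlhub.threadConnection.rollback()
        raise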

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python
##
# This script provides the ability to manage sites in Stackdump.
##
import os
import sys
from optparse import OptionParser
from sqlobject import sqlhub, connectionForURI
from pysolr import Solr
from stackdump.models import Site
script_dir = os.path.dirname(sys.argv[0])
# FUNCTIONS
def list_sites():
# connect to the data sources
db_path = os.path.abspath(os.path.join(script_dir, '../../../../data/stackdump.sqlite'))
# connect to the database
print('Connecting to the database...')
conn_str = 'sqlite://' + db_path
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')
sites = list(Site.select()) # force the lazy method to execute
if len(sites) > 0:
print('[site key] site name')
print('-' * 80)
for site in sites:
print('[%s] %s' % (site.key, site.name))
def delete_site(site_key):
# connect to the data sources
db_path = os.path.abspath(os.path.join(script_dir, '../../../../data/stackdump.sqlite'))
# connect to the database
print('Connecting to the database...')
conn_str = 'sqlite://' + db_path
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')
# connect to solr
print('Connecting to solr...')
solr = Solr("http://localhost:8983/solr/")
print('Connected.\n')
site = Site.select(Site.q.key==site_key).getOne(None)
if not site:
print 'Site key %s does not exist.' % site_key
sys.exit(1)
sqlhub.threadConnection = sqlhub.processConnection.transaction()
print('Deleting site "%s" from the database... ' % site.name)
sys.stdout.flush()
Site.delete(site.id) # the relationship cascades, so other rows will be deleted
print('Deleted.\n')
print('Deleting site "%s" from solr... ' % site.name)
solr.delete(q='siteKey:"%s"' % site_key)
print('Deleted.\n')
sqlhub.threadConnection.commit(close=True)
# END FUNCTIONS
# MAIN METHOD
if __name__ == '__main__':
parser = OptionParser()
parser.add_option('-l', '--list-sites', help='List sites imported into Stackdump.', action="store_true")
parser.add_option('-d', '--delete-site', help='Delete a site from Stackdump.', metavar='SITE_KEY')
(cmd_options, cmd_args) = parser.parse_args()
if cmd_options.list_sites:
list_sites()
elif cmd_options.delete_site:
delete_site(cmd_options.delete_site)
else:
parser.print_help()
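A hypothetical session with the new script; the file name manage_sites.py is assumed (it is not shown in this diff) and the site data is illustrative:

$ python manage_sites.py --list-sites
Connecting to the database...
Connected.

[site key] site name
--------------------------------------------------------------------------------
[android] Android Enthusiasts

$ python manage_sites.py -d android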