mirror of https://github.com/djohnlewis/stackdump synced 2025-04-07 10:13:27 +00:00

Added some new fields for sites, and ability to look up details from the sites RSS feed.

This commit is contained in:
Samuel Lai 2011-11-01 17:36:14 +11:00
parent 97740a206d
commit 489d9aec22
2 changed files with 72 additions and 18 deletions
python/src/stackdump

@@ -9,6 +9,8 @@ import os
 import xml.sax
 from datetime import datetime
 import re
+from optparse import OptionParser
+from xml.etree import ElementTree
 from sqlobject import sqlhub, connectionForURI, AND, OR
 from pysolr import Solr
@@ -457,11 +459,19 @@ class PostContentHandler(xml.sax.ContentHandler):
         self.unfinished_questions.clear()
 
 # MAIN METHOD
-if len(sys.argv) != 2:
-    print('One argument is expected - the path to the extracted XML files.')
+parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
+parser.add_option('-n', '--site-name', help='Name of the site.')
+parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
+parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
+parser.add_option('-c', '--dump-date', help='Dump date of the site.')
+
+(cmd_options, cmd_args) = parser.parse_args()
+
+if len(cmd_args) < 1:
+    print('The path to the extracted XML files is required.')
     sys.exit(1)
 
-xml_root = sys.argv[1]
+xml_root = cmd_args[0]
 print('Using the XML root path: ' + xml_root + '\n')
 
 if not os.path.exists(xml_root):
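The hunk above replaces the single positional argument with an OptionParser that keeps the XML root positional and adds optional flags for the site details. A minimal sketch of how those options parse; the argument values below are hypothetical, the real script reads sys.argv:

from optparse import OptionParser

parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
parser.add_option('-n', '--site-name', help='Name of the site.')
parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
parser.add_option('-c', '--dump-date', help='Dump date of the site.')

# parse a made-up argument list instead of sys.argv[1:]
cmd_options, cmd_args = parser.parse_args(
    ['-n', 'Super User', '-k', 'superuser', '/tmp/superuser-dump'])

print(cmd_options.site_name)   # Super User
print(cmd_options.site_key)    # superuser
print(cmd_args[0])             # /tmp/superuser-dump

Options left off the command line come back as None, which is what the site-info code below relies on when deciding whether to fall back to readme.txt and the sites feed.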
@@ -489,36 +499,77 @@ Comment.createTable(ifNotExists=True)
 User.createTable(ifNotExists=True)
 print('Created.\n')
 
-# SITE NAME
-# get the site name from the first line of readme.txt. This could be fragile.
-with open(os.path.join(xml_root, 'readme.txt')) as f:
-    site_desc = f.readline().strip()
-
-# assume if there's a colon in the name, the name part is before, and the date
-# part is after.
-if ':' in site_desc:
-    site_name, site_date = site_desc.split(':')
-else:
-    site_name = site_desc
-    site_date = ''
-
-print('Site name is %s\n' % site_name)
+# SITE INFO
+site_name = cmd_options.site_name
+dump_date = cmd_options.dump_date
+
+# only look if they were not specified at the command line
+if not (site_name and dump_date):
+    # get the site name from the first line of readme.txt. This could be fragile.
+    with open(os.path.join(xml_root, 'readme.txt')) as f:
+        site_readme_desc = f.readline().strip()
+
+    # assume if there's a colon in the name, the name part is before, and the date
+    # part is after.
+    if ':' in site_readme_desc:
+        site_name, dump_date = site_readme_desc.split(':')
+        site_name = site_name.strip()
+        dump_date = dump_date.strip()
+    else:
+        site_name = site_readme_desc
+        dump_date = None
+
+    # if the phrase ' - Data Dump' is in the site name, remove it
+    i = site_name.rfind(' - Data Dump')
+    if i >= 0:
+        site_name = site_name[:i].strip()
+
+# look for the site in the sites RSS file
+site_desc = cmd_options.site_desc
+site_key = cmd_options.site_key
+if not (site_desc and site_key):
+    sites_file_path = os.path.join(script_dir, '../../../../data/sites')
+    if os.path.exists(sites_file_path):
+        with open(sites_file_path) as f:
+            sites_file = ElementTree.parse(f)
+            entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
+            for entry in entries:
+                entry_title = entry.find('{http://www.w3.org/2005/Atom}title').text
+                if site_name == entry_title:
+                    # this entry matches the detected site name
+                    # extract the key from the url - remove the http:// and .com
+                    site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
+                    if site_key.startswith('http://'):
+                        site_key = site_key[len('http://'):]
+                    if site_key.endswith('.com'):
+                        site_key = site_key[:-len('.com')]
+                    if site_key.endswith('.stackexchange'):
+                        site_key = site_key[:-len('.stackexchange')]
+                    site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()
+
+print 'Name: %s\nKey: %s\nDesc: %s\nDump Date: %s\n' % (site_name, site_key, site_desc, dump_date)
+
+if not (site_name and site_key and site_desc and dump_date):
+    print 'Could not get all the details for the site.'
+    print 'Use command-line parameters to specify the missing details (listed as None).'
+    sys.exit(1)
 
 # check if site is already in database; if so, purge the data.
 sites = Site.select(Site.q.name==site_name)
 # the site really shouldn't exist more than once, but just in case
 for site in sites:
-    print('Deleting site "%s" from the database... ' % site.desc)
+    print('Deleting site "%s" from the database... ' % site.name)
     sys.stdout.flush()
     Site.delete(site.id) # the relationship cascades, so other rows will be deleted
     print('Deleted.\n')
 
-    print('Deleting site "%s" from the solr... ' % site_desc)
+    print('Deleting site "%s" from the solr... ' % site_name)
     solr.delete(q='siteName:"%s"' % site_name)
     print('Deleted.\n')
 
 # create a new Site
-site = Site(name=site_name, desc=site_desc)
+site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date, import_date=datetime.now())
 
 # BADGES
 print('[badge] PARSING BADGES...')
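The new site lookup walks an Atom feed with namespace-qualified ElementTree queries and derives a short site key by stripping the URL scheme and the .com / .stackexchange suffixes from the entry id. A self-contained sketch of that same lookup and key-stripping logic, run against a tiny in-memory feed whose entry values are made up for illustration:

from xml.etree import ElementTree

ATOM_NS = '{http://www.w3.org/2005/Atom}'

# a made-up Atom document standing in for the real 'sites' file
SAMPLE_FEED = """<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <title>Super User</title>
    <id>http://superuser.com</id>
    <summary> For computer enthusiasts and power users. </summary>
  </entry>
</feed>"""

feed = ElementTree.fromstring(SAMPLE_FEED)
for entry in feed.findall(ATOM_NS + 'entry'):
    title = entry.find(ATOM_NS + 'title').text
    site_key = entry.find(ATOM_NS + 'id').text
    # strip the scheme and the .com / .stackexchange suffixes, as the commit does
    if site_key.startswith('http://'):
        site_key = site_key[len('http://'):]
    if site_key.endswith('.com'):
        site_key = site_key[:-len('.com')]
    if site_key.endswith('.stackexchange'):
        site_key = site_key[:-len('.stackexchange')]
    desc = entry.find(ATOM_NS + 'summary').text.strip()
    print('%s -> key=%s, desc=%s' % (title, site_key, desc))

For an id of http://gaming.stackexchange.com the same stripping would yield the key 'gaming'.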

@@ -7,6 +7,9 @@ from sqlobject import *
 class Site(SQLObject):
     name = UnicodeCol()
     desc = UnicodeCol()
+    key = UnicodeCol()
+    dump_date = UnicodeCol()
+    import_date = DateTimeCol()
 
 class Badge(SQLObject):
     sourceId = IntCol()
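The three new columns hold the site key, the dump date (kept as text, since it comes straight from readme.txt or the command line), and the time of import. A hedged sketch of the extended model being declared and populated with SQLObject against an in-memory SQLite database; the row values are illustrative only, the real importer fills them from the dump and the sites feed:

from datetime import datetime
from sqlobject import SQLObject, UnicodeCol, DateTimeCol, sqlhub, connectionForURI

sqlhub.processConnection = connectionForURI('sqlite:/:memory:')

class Site(SQLObject):
    name = UnicodeCol()
    desc = UnicodeCol()
    key = UnicodeCol()
    dump_date = UnicodeCol()      # stored as text, e.g. the date string after the colon in readme.txt
    import_date = DateTimeCol()

Site.createTable(ifNotExists=True)

# illustrative values only
site = Site(name='Super User',
            desc='For computer enthusiasts and power users.',
            key='superuser',
            dump_date='September 2011',
            import_date=datetime.now())
print('%s (%s)' % (site.name, site.key))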