Mirror of https://github.com/djohnlewis/stackdump (synced 2024-12-04 23:17:37 +00:00)
Added scripts for deleting a site from the system, and getting site info from the net.
parent 489d9aec22
commit 5e930bbc08
python/src/stackdump/dataproc/delete_site.py (new file, +47 lines)
@@ -0,0 +1,47 @@
#!/usr/bin/env python

# This script deletes the site specified by the ID in the first parameter.

import os
import sys

from sqlobject import sqlhub, connectionForURI
from pysolr import Solr

from stackdump.models import Site

script_dir = os.path.dirname(sys.argv[0])

if len(sys.argv) < 2:
    print('The site ID needs to be specified as the first parameter.')
    sys.exit(1)

# work out the path to the database
db_path = os.path.abspath(os.path.join(script_dir, '../../../../data/stackdump.sqlite'))

# connect to the database
print('Connecting to the database...')
conn_str = 'sqlite://' + db_path
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')

# connect to solr
print('Connecting to solr...')
solr = Solr("http://localhost:8983/solr/")
print('Connected.\n')

site_id = int(sys.argv[1])
# getOne(None) returns None instead of raising if no row matches
site = Site.select(Site.q.id == site_id).getOne(None)
if not site:
    print('Site ID %d does not exist.' % site_id)
    sys.exit(1)

site_name = site.name
print('Deleting site "%s" from the database... ' % site_name)
sys.stdout.flush()
Site.delete(site.id)  # the relationship cascades, so other rows will be deleted
print('Deleted.\n')

print('Deleting site "%s" from solr... ' % site_name)
solr.delete(q='siteName:"%s"' % site_name)
print('Deleted.\n')
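One caveat worth noting: the final solr.delete() call interpolates the site name straight into a Solr query string, so a name containing a quote or another Lucene query metacharacter would produce a malformed query. Below is a minimal sketch of escaping the value first, assuming the standard Lucene special-character set; the helper name and the exact escape set are illustrative, not part of this commit.

# Illustrative helper (assumption, not in this commit): backslash-escape
# characters that are special in Lucene/Solr query syntax.
SOLR_SPECIAL_CHARS = '\\+-&|!(){}[]^"~*?:/'

def escape_solr_value(value):
    # prefix every special character (including backslash) with a backslash
    return ''.join('\\' + c if c in SOLR_SPECIAL_CHARS else c for c in value)

# usage with the delete query above:
# solr.delete(q='siteName:"%s"' % escape_solr_value(site_name))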
python/src/stackdump/dataproc/get_sites_info.py (new file, +43 lines)
@@ -0,0 +1,43 @@
#!/usr/bin/env python

# This script downloads the sites RSS file and associated logos from the net.

import urllib
from xml.etree import ElementTree
import os
import sys

script_dir = os.path.dirname(sys.argv[0])
sites_file_path = os.path.join(script_dir, '../../../../data/sites')

# download the sites RSS file
print 'Downloading StackExchange sites RSS file...',
urllib.urlretrieve('http://stackexchange.com/feeds/sites', sites_file_path)
print 'done.'

print ''

# parse sites RSS file and download logos
logos_dir_path = os.path.join(script_dir, '../../../media/images/logos')
if not os.path.exists(logos_dir_path):
    os.mkdir(logos_dir_path)

with open(sites_file_path) as f:
    sites_file = ElementTree.parse(f)
    entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')

    for entry in entries:
        entry_title = entry.find('{http://www.w3.org/2005/Atom}title').text

        # extract the key from the url - strip the http:// prefix and the
        # .com/.stackexchange suffixes
        site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
        if site_key.startswith('http://'):
            site_key = site_key[len('http://'):]
        if site_key.endswith('.com'):
            site_key = site_key[:-len('.com')]
        if site_key.endswith('.stackexchange'):
            site_key = site_key[:-len('.stackexchange')]

        print 'Downloading logo for %s...' % entry_title,
        urllib.urlretrieve('http://sstatic.net/%s/img/icon-48.png' % site_key,
                           os.path.join(logos_dir_path, '%s.png' % site_key))
        print 'done.'
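For reference, the suffix-stripping above turns a feed entry id into the short key used in logo URLs and filenames. Here is a standalone restatement of that logic with example inputs and outputs; the helper name is illustrative, not part of this commit.

# Illustrative restatement (not in this commit) of the key extraction above.
def extract_site_key(site_url):
    """Strip the scheme and known suffixes from a site id URL, e.g.
    'http://stackoverflow.com' -> 'stackoverflow'
    'http://unix.stackexchange.com' -> 'unix'
    """
    key = site_url
    if key.startswith('http://'):
        key = key[len('http://'):]
    for suffix in ('.com', '.stackexchange'):  # same order as the script
        if key.endswith(suffix):
            key = key[:-len(suffix)]
    return key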