mirror of
https://github.com/djohnlewis/stackdump
synced 2024-12-04 06:57:36 +00:00
Python update
This commit is contained in:
parent
f53efd1422
commit
4616d04c56
2
.gitignore
vendored
2
.gitignore
vendored
@ -4,7 +4,7 @@
|
||||
.DS_Store
|
||||
|
||||
# ignore any data
|
||||
^data/*$
|
||||
data
|
||||
|
||||
# ignore working bytecode
|
||||
\.class$
|
||||
|
1
python/src/stackdump/commands/default_settings.py
Symbolic link
1
python/src/stackdump/commands/default_settings.py
Symbolic link
@ -0,0 +1 @@
|
||||
../default_settings.py
|
@ -1,33 +1,49 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# This script downloads the sites RSS file and associated logos from the net.
|
||||
|
||||
import tarfile
|
||||
import urllib.request
|
||||
from xml.etree import ElementTree
|
||||
import sys
|
||||
import os, ssl
|
||||
def printf(format, *args):
    """Write a %-formatted message to standard output, C ``printf`` style.

    ``format`` is always interpolated with ``args`` using the ``%``
    operator, even when no extra arguments are given.  A literal percent
    sign must therefore be written as ``%%``, and text that has already
    been formatted by the caller must not contain an unescaped ``%``.

    No newline is appended; include ``\\n`` in ``format`` when one is wanted.
    """
    sys.stdout.write(format % args)
|
||||
from shutil import copy
|
||||
import os, ssl, fnmatch
|
||||
from optparse import OptionParser
|
||||
from xml.etree import ElementTree
|
||||
import elasticsearch
|
||||
|
||||
import settings
|
||||
from sqlobject import sqlhub, connectionForURI,AND, IN, SQLObject, \
|
||||
UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
|
||||
from sqlobject.sqlbuilder import Delete, Insert
|
||||
from sqlobject.styles import DefaultStyle
|
||||
from pysolr import Solr, SolrError
|
||||
|
||||
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
|
||||
getattr(ssl, '_create_unverified_context', None)):
|
||||
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
ssl._create_defat_https_context = ssl._create_unverified_context
|
||||
|
||||
se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
|
||||
sites_path = os.path.join(se_dir, 'Sites.xml')
|
||||
|
||||
script_dir = os.path.dirname(sys.argv[0])
|
||||
sites_file_path = os.path.join(script_dir, '../../../../data/sites')
|
||||
|
||||
sites_file_path = os.path.join(script_dir, ''
|
||||
'../../../../data/')
|
||||
# ensure the data directory exists
|
||||
# download the sites RSS file
|
||||
|
||||
if not os.path.exists(os.path.dirname(sites_file_path)):
|
||||
os.mkdir(os.path.dirname(sites_file_path))
|
||||
|
||||
# download the sites RSS file
|
||||
print('Downloading StackExchange sites XML file...', )
|
||||
urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
|
||||
# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
|
||||
print('done.')
|
||||
|
||||
print('')
|
||||
|
||||
|
||||
|
||||
# parse sites RSS file and download logos
|
||||
images_dir_path = os.path.join(script_dir, '../../../media/images')
|
||||
print(os.listdir(images_dir_path))
|
||||
@ -43,50 +59,80 @@ if not os.path.exists(badges_dir_path):
|
||||
|
||||
with open(sites_path) as f:
|
||||
sites_file = ElementTree.parse(f)
|
||||
rows = sites_file.findall('row')
|
||||
sites = sites_file.findall('row')
|
||||
# print(rows[0].attrib)
|
||||
|
||||
for row in rows:
|
||||
entry_title = row.attrib['Name']
|
||||
print('Entry: ' + entry_title)
|
||||
|
||||
for site in sites:
|
||||
site_title = site.attrib['LongName']
|
||||
site_name = site.attrib['Name']
|
||||
# extract the key from the url - remove the http:// and .com
|
||||
site_key = row.attrib['TinyName']
|
||||
print('Site: ' + site_key)
|
||||
logo_url = row.attrib['ImageUrl']
|
||||
icon_url = row.attrib['IconUrl']
|
||||
badge_url = row.attrib['BadgeIconUrl']
|
||||
site_key = site.attrib['TinyName']
|
||||
site_url = site.attrib['Url'][8:]
|
||||
logo_url = site.attrib['ImageUrl']
|
||||
icon_url = site.attrib['IconUrl']
|
||||
badge_url = site.attrib['BadgeIconUrl']
|
||||
|
||||
|
||||
site_vars = (site_url, site_key, site_name, site_title)
|
||||
# print(site_vars)
|
||||
printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)
|
||||
try:
|
||||
logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
|
||||
if not os.path.exists(logo_file):
|
||||
print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
|
||||
except:
|
||||
print('Failed download logo for %s...' % site_title)
|
||||
|
||||
try:
|
||||
print('Downloading logo for %s...' % entry_title,
|
||||
urllib.request.urlretrieve(logo_url, os.path.join(logos_dir_path, 'logo-%s.png' % site_key)))
|
||||
icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
|
||||
if not os.path.exists(icon_path):
|
||||
print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
|
||||
except:
|
||||
print('Failed download logo for %s...' % entry_title)
|
||||
print('Failed download ico for %s...' % site_title)
|
||||
|
||||
try:
|
||||
print('Downloading icon for %s...' % entry_title,
|
||||
urllib.request.urlretrieve(icon_url, os.path.join(icons_dir_path, 'icon-%s.png' % site_key)))
|
||||
badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
|
||||
if not os.path.exists(icon_path):
|
||||
print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
|
||||
except:
|
||||
print('Failed download ico for %s...' % entry_title)
|
||||
printf('Failed download badge for %s...' % site_title)
|
||||
|
||||
try:
|
||||
print('Downloading badge for %s...' % entry_title,
|
||||
urllib.request.urlretrieve(badge_url, os.path.join(badges_dir_path, 'badge-%s.png' % site_key)))
|
||||
except:
|
||||
print('Failed download badge for %s...' % entry_title)
|
||||
site_files = []
|
||||
print('Key: ' + site_url)
|
||||
for root, dirs, files in os.walk(se_dir):
|
||||
for name in files:
|
||||
if fnmatch.fnmatch(name, site_url + '*'):
|
||||
print('Match: ' + os.path.join(root, name))
|
||||
site_files.append(os.path.join(root, name))
|
||||
|
||||
print('done.')
|
||||
sites_data = sites_file_path
|
||||
for site_file in site_files:
|
||||
dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z'\
|
||||
+ os.sep + os.path.basename(site_file)
|
||||
os.makedirs(dst, exist_ok=True)
|
||||
os.chdir(dst)
|
||||
os.system('tar xzf '+site_file)
|
||||
print('Data: ' + site_file)
|
||||
|
||||
def prepare_site(xml_root, dump_date, site_key):
    """Validate the XML root directory and connect to the Stackdump database.

    Parameters:
        xml_root: path to the directory holding the extracted XML files.
        dump_date: accepted for the caller's convenience; not used by the
            code in this function.
        site_key: accepted for the caller's convenience; not used by the
            code in this function.

    Exits the process with status 1 when ``xml_root`` does not exist.
    Otherwise the connection built from ``settings.DATABASE_CONN_STR`` is
    installed as ``sqlhub.processConnection``.
    """
    print('Using the XML root path: ' + xml_root + '\n')

    # Bail out before touching the database if there is nothing to read.
    if not os.path.exists(xml_root):
        print('The given XML root path does not exist.')
        sys.exit(1)

    # connect to the database
    print('Connecting to the Stackdump database...')
    conn_str = settings.DATABASE_CONN_STR
    sqlhub.processConnection = connectionForURI(conn_str)
    print('Connected.\n')
|
||||
|
||||
# MAIN METHOD
|
||||
if __name__ == '__main__':
|
||||
parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
|
||||
parser.add_option('-n', '--site-name', help='Name of the site.')
|
||||
parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
|
||||
parser = OptionParser(usage='usage: %pro'
|
||||
'g [options] xml_root_dir')
|
||||
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
|
||||
parser.add_option('-c', '--dump-date', help='Dump date of the site.')
|
||||
parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
|
||||
parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
|
||||
|
||||
(cmd_options, cmd_args) = parser.parse_args()
|
||||
|
||||
@ -94,6 +140,4 @@ if __name__ == '__main__':
|
||||
print('The path to the directory containing the extracted XML files is required.')
|
||||
sys.exit(1)
|
||||
|
||||
prepare_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
|
||||
cmd_options.site_desc, cmd_options.site_key,
|
||||
cmd_options.base_url, answer_yes=cmd_options.answer_yes)
|
||||
prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
|
||||
|
911
python/src/stackdump/commands/import_recent.py
Normal file
911
python/src/stackdump/commands/import_recent.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -2,8 +2,6 @@
|
||||
|
||||
# This script takes extracted site files and inserts them into the database.
|
||||
|
||||
from __future__ import with_statement
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
@ -25,13 +23,7 @@ from pysolr import Solr, SolrError
|
||||
|
||||
from stackdump.models import Site, Badge, User
|
||||
from stackdump import settings
|
||||
|
||||
try:
|
||||
# For Python < 2.6 or people using a newer version of simplejson
|
||||
import simplejson as json
|
||||
except ImportError:
|
||||
# For Python >= 2.6
|
||||
import json
|
||||
import json
|
||||
|
||||
script_dir = os.path.dirname(sys.argv[0])
|
||||
|
||||
@ -649,8 +641,7 @@ def get_file_path(dir_path, filename):
|
||||
return os.path.abspath(os.path.join(dir_path, matches[0]))
|
||||
|
||||
|
||||
def import_site(xml_root, site_name, dump_date, site_desc, site_key,
|
||||
site_base_url, answer_yes=False):
|
||||
def import_site(xml_root, dump_date,site_key)
|
||||
print('Using the XML root path: ' + xml_root + '\n')
|
||||
|
||||
if not os.path.exists(xml_root):
|
||||
@ -909,19 +900,12 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
|
||||
# MAIN METHOD
|
||||
if __name__ == '__main__':
|
||||
parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
|
||||
parser.add_option('-n', '--site-name', help='Name of the site.')
|
||||
parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
|
||||
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
|
||||
parser.add_option('-c', '--dump-date', help='Dump date of the site.')
|
||||
parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
|
||||
parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
|
||||
|
||||
(cmd_options, cmd_args) = parser.parse_args()
|
||||
|
||||
if len(cmd_args) < 1:
|
||||
print('The path to the directory containing the extracted XML files is required.')
|
||||
sys.exit(1)
|
||||
|
||||
import_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
|
||||
cmd_options.site_desc, cmd_options.site_key,
|
||||
cmd_options.base_url, answer_yes=cmd_options.answer_yes)
|
||||
import_site(cmd_args[0],cmd_options.dump_date, cmd_options.site_key)
|
1
python/src/stackdump/commands/settings.py
Symbolic link
1
python/src/stackdump/commands/settings.py
Symbolic link
@ -0,0 +1 @@
|
||||
../settings.py
|
1
questions (1).json
Normal file
1
questions (1).json
Normal file
File diff suppressed because one or more lines are too long
1
questions.json
Normal file
1
questions.json
Normal file
File diff suppressed because one or more lines are too long
BIN
schema.xlsx
Normal file
BIN
schema.xlsx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user