mirror of https://github.com/djohnlewis/stackdump
synced 2025-02-22 13:14:46 +00:00

Python update

This commit is contained in:
parent f53efd1422
commit 4616d04c56
2	.gitignore (vendored)
@@ -4,7 +4,7 @@
 .DS_Store
 
 # ignore any data
-^data/*$
+data
 
 # ignore working bytecode
 \.class$
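Aside: .gitignore entries are matched as globs, not regular expressions, so the anchored `^data/*$` pattern never actually ignored anything; the plain `data` entry matches any file or directory with that name.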
1	python/src/stackdump/commands/default_settings.py (symbolic link)

@@ -0,0 +1 @@
+../default_settings.py
@@ -1,33 +1,49 @@
 #!/usr/bin/env python
 
 # This script downloads the sites RSS file and associated logos from the net.
 
+import tarfile
+import urllib.request
+from xml.etree import ElementTree
 import sys
-import os, ssl
+import os, ssl, fnmatch
+
+def printf(format, *args):
+    sys.stdout.write(format % args)
+
 from shutil import copy
 from optparse import OptionParser
-from xml.etree import ElementTree
+import elasticsearch
 
 import settings
 from sqlobject import sqlhub, connectionForURI, AND, IN, SQLObject, \
     UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
 from sqlobject.sqlbuilder import Delete, Insert
 from sqlobject.styles import DefaultStyle
 from pysolr import Solr, SolrError
 
 if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
         getattr(ssl, '_create_unverified_context', None)):
     ssl._create_default_https_context = ssl._create_unverified_context
 
+se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
+sites_path = os.path.join(se_dir, 'Sites.xml')
+
 script_dir = os.path.dirname(sys.argv[0])
-sites_file_path = os.path.join(script_dir, '../../../../data/sites')
+sites_file_path = os.path.join(script_dir, '../../../../data/')
 
 # ensure the data directory exists
 if not os.path.exists(os.path.dirname(sites_file_path)):
     os.mkdir(os.path.dirname(sites_file_path))
 
 # download the sites RSS file
 print('Downloading StackExchange sites XML file...')
-urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
+# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
 print('done.')
 print('')
 
 # parse sites RSS file and download logos
 images_dir_path = os.path.join(script_dir, '../../../media/images')
 print(os.listdir(images_dir_path))
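A note on the PYTHONHTTPSVERIFY block above: reassigning ssl._create_default_https_context turns off certificate verification process-wide. A narrower sketch, passing an unverified context only to the one call that needs it (URL as in the script; this is an illustration, not part of the commit):

    import ssl
    import urllib.request

    # Unverified context for this single request only, rather than globally.
    ctx = ssl._create_unverified_context()
    url = 'https://archive.org/download/stackexchange/Sites.xml'
    with urllib.request.urlopen(url, context=ctx) as resp:
        data = resp.read()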
@@ -43,50 +59,80 @@ if not os.path.exists(badges_dir_path):
 
 with open(sites_path) as f:
     sites_file = ElementTree.parse(f)
-    rows = sites_file.findall('row')
-    # print(rows[0].attrib)
+    sites = sites_file.findall('row')
 
-for row in rows:
-    entry_title = row.attrib['Name']
-    print('Entry: ' + entry_title)
+for site in sites:
+    site_title = site.attrib['LongName']
+    site_name = site.attrib['Name']
     # extract the key from the url - remove the http:// and .com
-    site_key = row.attrib['TinyName']
-    print('Site: ' + site_key)
-    logo_url = row.attrib['ImageUrl']
-    icon_url = row.attrib['IconUrl']
-    badge_url = row.attrib['BadgeIconUrl']
+    site_key = site.attrib['TinyName']
+    site_url = site.attrib['Url'][8:]
+    logo_url = site.attrib['ImageUrl']
+    icon_url = site.attrib['IconUrl']
+    badge_url = site.attrib['BadgeIconUrl']
 
+    site_vars = (site_url, site_key, site_name, site_title)
+    # print(site_vars)
+    printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)
 
     try:
-        print('Downloading logo for %s...' % entry_title,
-              urllib.request.urlretrieve(logo_url, os.path.join(logos_dir_path, 'logo-%s.png' % site_key)))
+        logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
+        if not os.path.exists(logo_file):
+            print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
     except:
-        print('Failed download logo for %s...' % entry_title)
+        print('Failed to download logo for %s...' % site_title)
 
     try:
-        print('Downloading icon for %s...' % entry_title,
-              urllib.request.urlretrieve(icon_url, os.path.join(icons_dir_path, 'icon-%s.png' % site_key)))
+        icon_file = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
+        if not os.path.exists(icon_file):
+            print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_file))
     except:
-        print('Failed download icon for %s...' % entry_title)
+        print('Failed to download icon for %s...' % site_title)
 
     try:
-        print('Downloading badge for %s...' % entry_title,
-              urllib.request.urlretrieve(badge_url, os.path.join(badges_dir_path, 'badge-%s.png' % site_key)))
+        badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
+        if not os.path.exists(badge_file):
+            print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
     except:
-        print('Failed download badge for %s...' % entry_title)
+        print('Failed to download badge for %s...' % site_title)
 
+    site_files = []
+    print('Key: ' + site_url)
+    for root, dirs, files in os.walk(se_dir):
+        for name in files:
+            if fnmatch.fnmatch(name, site_url + '*'):
+                print('Match: ' + os.path.join(root, name))
+                site_files.append(os.path.join(root, name))
 
 print('done.')
+sites_data = sites_file_path
+for site_file in site_files:
+    dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z' \
+          + os.sep + os.path.basename(site_file)
+    os.makedirs(dst, exist_ok=True)
+    os.chdir(dst)
+    os.system('tar xzf ' + site_file)
+    print('Data: ' + site_file)
 
+def prepare_site(xml_root, dump_date, site_key):
+    print('Using the XML root path: ' + xml_root + '\n')
+
+    if not os.path.exists(xml_root):
+        print('The given XML root path does not exist.')
+        sys.exit(1)
+
+    # connect to the database
+    print('Connecting to the Stackdump database...')
+    conn_str = settings.DATABASE_CONN_STR
+    sqlhub.processConnection = connectionForURI(conn_str)
+    print('Connected.\n')
 
 # MAIN METHOD
 if __name__ == '__main__':
     parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
-    parser.add_option('-n', '--site-name', help='Name of the site.')
-    parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
     parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
     parser.add_option('-c', '--dump-date', help='Dump date of the site.')
-    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
-    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
 
     (cmd_options, cmd_args) = parser.parse_args()
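The per-site extraction loop above shells out to tar via os.system after an os.chdir. Since tarfile is already imported, here is a sketch of the same step without shelling out (assumes gzipped tarballs; variable names as in the loop):

    import tarfile

    for site_file in site_files:
        dst = os.path.join(sites_data, site_key[0], site_key, '7z',
                           os.path.basename(site_file))
        os.makedirs(dst, exist_ok=True)
        # Extract directly into dst instead of os.chdir() + os.system('tar xzf ...');
        # avoids changing the process-wide working directory and shell quoting issues.
        with tarfile.open(site_file, 'r:gz') as archive:
            archive.extractall(dst)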
@@ -94,6 +140,4 @@ if __name__ == '__main__':
         print('The path to the directory containing the extracted XML files is required.')
         sys.exit(1)
 
-    prepare_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
-                 cmd_options.site_desc, cmd_options.site_key,
-                 cmd_options.base_url, answer_yes=cmd_options.answer_yes)
+    prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
911	python/src/stackdump/commands/import_recent.py (new file)

File diff suppressed because it is too large
@@ -2,8 +2,6 @@
 
 # This script takes extracted site files and inserts them into the database.
 
-from __future__ import with_statement
-
 import sys
 import os
 import time
@@ -25,13 +23,7 @@ from pysolr import Solr, SolrError
 
 from stackdump.models import Site, Badge, User
 from stackdump import settings
 
-try:
-    # For Python < 2.6 or people using a newer version of simplejson
-    import simplejson as json
-except ImportError:
-    # For Python >= 2.6
-    import json
+import json
 
 script_dir = os.path.dirname(sys.argv[0])
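Dropping the simplejson fallback is safe here: the standard-library json module has shipped since Python 2.6 with the same loads/dumps interface this code relies on, so the bare import suffices. A minimal check:

    import json

    # Round-trip a row the way the importer serialises post data
    # (dict order is preserved on Python 3.7+).
    row = json.loads('{"Id": "1", "DisplayName": "example"}')
    assert json.dumps(row) == '{"Id": "1", "DisplayName": "example"}'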
@@ -649,8 +641,7 @@ def get_file_path(dir_path, filename):
     return os.path.abspath(os.path.join(dir_path, matches[0]))
 
 
-def import_site(xml_root, site_name, dump_date, site_desc, site_key,
-                site_base_url, answer_yes=False):
+def import_site(xml_root, dump_date, site_key):
     print('Using the XML root path: ' + xml_root + '\n')
 
     if not os.path.exists(xml_root):
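With the narrowed signature, import_site takes only the XML root, the dump date, and the site key; the site name, description, and base URL presumably now come from the sites data itself. An illustrative call (all values made up):

    import_site('/home/user/stackexchange/coffee', '2025-02-22', 'coffee')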
@@ -909,19 +900,12 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
 # MAIN METHOD
 if __name__ == '__main__':
     parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
-    parser.add_option('-n', '--site-name', help='Name of the site.')
-    parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
     parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
     parser.add_option('-c', '--dump-date', help='Dump date of the site.')
-    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
-    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
 
     (cmd_options, cmd_args) = parser.parse_args()
 
     if len(cmd_args) < 1:
         print('The path to the directory containing the extracted XML files is required.')
         sys.exit(1)
 
-    import_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
-                cmd_options.site_desc, cmd_options.site_key,
-                cmd_options.base_url, answer_yes=cmd_options.answer_yes)
+    import_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
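The trimmed option surface leaves only -k/--site-key and -c/--dump-date. A minimal sketch of how OptionParser hands these through to import_site (command-line values are made up):

    from optparse import OptionParser

    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
    parser.add_option('-c', '--dump-date', help='Dump date of the site.')

    # Parsing a hypothetical command line:
    (opts, args) = parser.parse_args(['-k', 'coffee', '-c', '2025-02-22', '/tmp/xml'])
    assert opts.site_key == 'coffee'
    assert opts.dump_date == '2025-02-22'
    assert args == ['/tmp/xml']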
1	python/src/stackdump/commands/settings.py (symbolic link)

@@ -0,0 +1 @@
+../settings.py
1	questions (1).json (new file)

File diff suppressed because one or more lines are too long

1	questions.json (new file)

File diff suppressed because one or more lines are too long
BIN	schema.xlsx (new file)

Binary file not shown.