mirror of https://github.com/djohnlewis/stackdump
synced 2025-12-17 05:13:32 +00:00
Compare commits
20 Commits
| Author | SHA1 | Date |
|---|---|---|
| | ea8aefcaf7 | |
| | a10a6d1e4d | |
| | 4616d04c56 | |
| | f53efd1422 | |
| | 20693a8764 | |
| | f20e281d3d | |
| | 0f16cd4bce | |
| | 2d27d3efe4 | |
| | 3f29d45964 | |
| | dcc7203c97 | |
| | 6a7b8ea432 | |
| | c020660479 | |
| | 7f6ed7b438 | |
| | a06d2a4c55 | |
| | 55bec19665 | |
| | a59e3b59d0 | |
| | 40121f2600 | |
| | db026d2ccc | |
| | ae6e10e6c4 | |
| | f79df598d3 | |
.gitignore (vendored, new file, 31 lines)

```diff
@@ -0,0 +1,31 @@
+^JAVA_CMD$
+^PYTHON_CMD$
+
+.DS_Store
+
+# ignore any data
+data
+
+# ignore working bytecode
+\.class$
+\.pyc$
+
+^datadump/*
+
+# ignore test and tutorial directories
+test/*$
+tests/*$
+testsuite/*$
+tutorial/*$
+
+# Solr/Jetty
+^java/solr/server/solr-webapp/*
+^java/solr/server/logs/*
+
+# ignore the downloaded logos
+^python/media/images/logos48
+^python/media/images/icons
+^python/media/images/badges
+
+# PyCharm project files
+^.idea/
```
.idea/.gitignore (generated, vendored, new file, 3 lines)

```diff
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
```
.idea/inspectionProfiles/profiles_settings.xml (generated, new file, 6 lines)

```diff
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
```
.idea/modules.xml (generated, new file, 8 lines)

```diff
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/stackdump.iml" filepath="$PROJECT_DIR$/.idea/stackdump.iml" />
+    </modules>
+  </component>
+</project>
```
.idea/stackdump.iml (generated, new file, 8 lines)

```diff
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
```
.idea/vcs.xml (generated, new file, 6 lines)

```diff
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
```
```diff
@@ -1,3 +1,8 @@
+Tutorial (Thai language): http://www.youtube.com/watch?v=jHEBlHxEKeM
+"http://blog.sornram9254.com/stackdump-an-offline-browser-for-stackexchange-stackverflow/":http://blog.sornram9254.com/stackdump-an-offline-browser-for-stackexchange-stackverflow/
+
+- - - - - - - - - - - - - - - - - - - ORIGINAL README - - - - - - - - - - - - - - - - - - -
+
 h1. Stackdump - an offline browser for StackExchange sites.
 
 Stackdump was conceived for those who work in environments that do not have easy access to the StackExchange family of websites. It allows you to host a read-only instance of the StackExchange sites locally, accessible via a web browser.
```
```diff
@@ -12,13 +17,15 @@ h2. Screenshots
 
 h2. System Requirements
 
-Stackdump was written in Python and requires Python 2.5 or later (but not Python 3). It leverages Apache Solr, which requires the Java runtime (JRE), version 6 or later.
+Stackdump was written in Python and requires Python 2.5 or later (but not Python 3). There have been some reported issues with older versions of Python, so the ideal version to run is v2.7.6 (the latest 2.x version, as of writing). Stackdump also leverages Apache Solr, which requires the Java runtime (JRE), version 6 or later.
 
 Besides that, there are no OS-dependent dependencies, and it should work on any platform that Python and Java run on (although it only comes bundled with Linux scripts at the moment). It was, however, developed and tested on CentOS 5 running Python 2.7 and JRE 6 update 27.
 
 You will also need "7-zip":http://www.7-zip.org/ to extract the data dump files, but Stackdump does not use it directly so you can perform the extraction on another machine first.
 
-It is recommended that Stackdump be run on a system with at least 3GB of RAM, particularly if you intend to import StackOverflow into Stackdump. Apache Solr requires a fair bit of memory during the import process. It should also have a fair bit of space available; having at least roughly the space used by the raw, extracted, data dump XML files is a good rule of thumb (note that once imported, the raw data dump XML files are not needed by Stackdump any more).
+The amount of memory required for Stackdump depends on which dataset you want to import. For most datasets, at least 3GB of RAM is preferable. If you want to import StackOverflow, you must use a 64-bit operating system and a 64-bit version of Python, and also have at least 6GB of RAM available (or swap). If you do not have enough RAM available, the import process will likely fail with a _MemoryError_ message at some point.
+
+Make sure you have enough disk space too - having at least roughly the space used by the raw, extracted, data dump XML files available is a good rule of thumb (note that once imported, the raw data dump XML files are not needed by Stackdump any more).
 
 Finally, Stackdump has been tested and works in the latest browsers (IE9, FF10+, Chrome, Safari). It degrades fairly gracefully in older browsers, although some will have rendering issues, e.g. IE8.
```
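The new requirements paragraphs above boil down to a quick preflight check before importing. A minimal sketch in shell, assuming a 7-zip install; the archive name and paths are hypothetical examples, not part of the Stackdump distribution:

```bash
# Extract a data dump archive with 7-zip (archive name is an example)
7z x stackexchange.com-Posts.7z -o/srv/dumps/stackexchange

# The StackOverflow import needs ~6GB of RAM (or swap) available
free -h

# Rule of thumb: keep at least as much free disk as the extracted XML occupies
du -sh /srv/dumps/stackexchange
df -h /srv/dumps
```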
```diff
@@ -51,19 +58,25 @@ In total, the StackOverflow data dump has *15,933,529 posts* (questions and answers)
 
 I attempted this on a similarly spec'ed Windows 7 64-bit VM as well - 23 hours later and it is still trying to process the comments. The SQLite, Python or just disk performance is very poor for some reason. Therefore, if you intend on importing StackOverflow, I would advise you to run Stackdump on Linux instead. The smaller sites all complete within a reasonable time though, and there are no perceptible issues with performance on Windows as far as I'm aware.
 
+h3. Reports on importing the StackOverflow data dump, September 2014
+
+Due to the growth of the dataset, the import process now requires at least 6GB of RAM. This also means you must use a 64-bit operating system and a 64-bit version of Python.
+
 h2. Setting up
 
 Stackdump was designed for offline environments or environments with poor internet access, therefore it is bundled with all the dependencies it requires (with the exception of Python, Java and 7-zip).
 
 As long as you have:
-* "Python":http://python.org/download/,
-* "Java":http://java.com/en/download/manual.jsp,
+* "Python":http://python.org/download/, version 2.5 or later but not version 3 (tested with v2.7.6),
+* "Java":http://java.com/en/download/manual.jsp, version 6 (1.6) or later,
 * "Stackdump":https://bitbucket.org/samuel.lai/stackdump/downloads,
-* the "StackExchange Data Dump":https://archive.org/details/stackexchange (download the sites you wish to import - note that StackOverflow is split into 7 archive files; only Comments, Posts and Users are required), and
+* the "StackExchange Data Dump":https://archive.org/details/stackexchange (download the sites you wish to import - note that StackOverflow is split into 7 archive files; only Comments, Posts and Users are required, but after extraction the files need to be renamed to Comments.xml, Posts.xml and Users.xml respectively), and
 * "7-zip":http://www.7-zip.org/ (needed to extract the data dump files)
 
 ...you should be able to get an instance up and running.
 
+If you are using a 64-bit operating system, get the 64-bit version of Python.
+
 To provide a better experience, Stackdump can use the RSS feed content to pre-fill some of the required details during the import process, as well as to display the site logos in the app. Stackdump comes bundled with a script that downloads and places these bits in the right places. If you're in a completely offline environment however, it may be worth running this script on a connected box first.
 
 h3. Windows users
```
```diff
@@ -87,7 +100,7 @@ bq. If you're using Java 7 on Linux and you see an error similar to the following...
 this is because you have SELinux enabled. You will need to tell SELinux to allow Java to run by using the following command as root (amending the path as necessary) -
 @chcon -t textrel_shlib_t /opt/jre1.7.0_40/lib/i386/server/libjvm.so@
 
-Then type @python -V@ and check that it is version 2.5 or later (and not Python 3).
+Then type @python -V@ and check that it is version 2.5 or later (and not Python 3). Ideally this should be v2.7.6 or later as there have been some reported issues with earlier versions.
 
 If you would rather not put these versions in the PATH (e.g. you don't want to override the default version of Python in your Linux distribution), you can tell Stackdump which Java and/or Python to use explicitly by creating a file named @JAVA_CMD@ or @PYTHON_CMD@ respectively in the Stackdump root directory, and placing the path to the executable in there.
```
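For example, a minimal sketch of that override; the interpreter and install paths below are hypothetical, use whatever is on your system:

```bash
# Point Stackdump at explicit executables without changing PATH;
# both paths are examples, not part of the Stackdump distribution.
echo /usr/local/bin/python2.7 > /opt/stackdump/PYTHON_CMD
echo /opt/jre1.7.0_40/bin/java > /opt/stackdump/JAVA_CMD
```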
```diff
@@ -165,16 +178,20 @@ bc.. [program:stackdump-solr]
 command=/path/to/stackdump/start_solr.sh
 priority=900
 user=stackdump_user
 stopasgroup=true
 stdout_logfile=/path/to/stackdump/solr_stdout.log
 stderr_logfile=/path/to/stackdump/solr_stderr.log
 
 [program:stackdump-web]
 command=/path/to/stackdump/start_web.sh
 user=stackdump_user
 stopasgroup=true
 stdout_logfile=/path/to/stackdump/web_stdout.log
 stderr_logfile=/path/to/stackdump/web_stderr.log
 
-p. Yet another option for those using newer Linux distributions is to create native "systemd service definitions":http://www.freedesktop.org/software/systemd/man/systemd.service.html of type _simple_ for each of the components.
+p. Supervisor v3.0b1 or later is required, due to the _stopasgroup_ parameter. Without this parameter, Supervisor will not be able to stop the Stackdump components properly as they're being executed from a script.
+
+Yet another option for those using newer Linux distributions is to create native "systemd service definitions":http://www.freedesktop.org/software/systemd/man/systemd.service.html of type _simple_ for each of the components.
 
 h2. Maintenance
```
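A minimal sketch of such a unit for the Solr component, mirroring the Supervisor example above; the install path and user name are hypothetical, not part of the Stackdump distribution:

```bash
# Write a systemd unit of type "simple" for the Solr component (run as root);
# /opt/stackdump and stackdump_user are illustrative placeholders.
cat > /etc/systemd/system/stackdump-solr.service <<'EOF'
[Unit]
Description=Stackdump Solr instance

[Service]
Type=simple
User=stackdump_user
ExecStart=/opt/stackdump/start_solr.sh

[Install]
WantedBy=multi-user.target
EOF

# pick up the new unit and start it at boot and now
systemctl daemon-reload
systemctl enable --now stackdump-solr.service
```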
```diff
@@ -203,6 +220,7 @@ Stackdump leverages several open-source projects to do various things, including
 * "markdown":http://pypi.python.org/pypi/Markdown for rendering comments
 * "mathjax":http://www.mathjax.org/ for displaying mathematical expressions properly
 * "httplib2":http://code.google.com/p/httplib2/ as a dependency of pysolr
+* "requests":https://github.com/kennethreitz/requests/ as a dependency of pysolr
 * "Apache Solr":http://lucene.apache.org/solr/ for search functionality
 
 h2. Things not supported... yet
```
```diff
@@ -35,5 +35,5 @@ else
     # shift off the command name so we don't pass it on
     shift
 
-    $SCRIPT_DIR/start_python.sh $command "$@"
+    $SCRIPT_DIR/start_python3.sh $command "$@"
 fi
```
```diff
@@ -827,7 +827,12 @@ def _rewrite_html(html, app_url_root, sites_by_urls):
         internal_link = False
         url = t.get('href', None)
         if url:
-            host = urllib2.Request(url).get_host()
+            try:
+                host = urllib2.Request(url).get_host()
+            except ValueError:
+                # invalid URL or local anchor, leaving as-is
+                pass
+            else:
                 site = sites_by_urls.get(host, None)
                 if site:
                     # rewrite this URL for stackdump
@@ -836,18 +841,17 @@ def _rewrite_html(html, app_url_root, sites_by_urls):
                     question_id = question_id.groupdict()['id']
                     url = '%s%s/%s' % (app_url_root, site.key, question_id)
                     t.set('href', url)
-                    t.set('class', t.get('class', '') + ' internal-link')
                     internal_link = True
 
                     answer_id = SE_ANSWER_ID_RE.search(url)
                     if answer_id:
                         answer_id = answer_id.groupdict()['id']
                         url = '%s%s/a/%s' % (app_url_root, site.key, answer_id)
                         t.set('href', url)
-                        t.set('class', t.get('class', '') + ' internal-link')
                         internal_link = True
 
-        if not internal_link:
+        if internal_link:
+            t.set('class', t.get('class', '') + ' internal-link')
+        else:
             t.set('class', t.get('class', '') + ' external-link')
 
     # get a string back
```
python/src/stackdump/commands/default_settings.py (symbolic link, new file, 1 line)

```diff
@@ -0,0 +1 @@
+../default_settings.py
```
```diff
@@ -1,47 +1,142 @@
 #!/usr/bin/env python
 
 # This script downloads the sites RSS file and associated logos from the net.
 
-import urllib
+import tarfile
+import urllib.request
 from xml.etree import ElementTree
 import os
 import sys
+
+def printf(format, *args):
+    sys.stdout.write(format % args)
+
+from shutil import copy
+import os, ssl, fnmatch
+from optparse import OptionParser
+from xml.etree import ElementTree
+import elasticsearch
+
+import settings
+from sqlobject import sqlhub, connectionForURI, AND, IN, SQLObject, \
+    UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
+from sqlobject.sqlbuilder import Delete, Insert
+from sqlobject.styles import DefaultStyle
+from pysolr import Solr, SolrError
+
+# allow unverified HTTPS downloads unless PYTHONHTTPSVERIFY is set
+if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
+        getattr(ssl, '_create_unverified_context', None)):
+    ssl._create_default_https_context = ssl._create_unverified_context
+
+se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
+sites_path = os.path.join(se_dir, 'Sites.xml')
 
 script_dir = os.path.dirname(sys.argv[0])
-sites_file_path = os.path.join(script_dir, '../../../../data/sites')
+sites_file_path = os.path.join(script_dir, '../../../../data/')
 
 # ensure the data directory exists
 if not os.path.exists(os.path.dirname(sites_file_path)):
     os.mkdir(os.path.dirname(sites_file_path))
 
 # download the sites RSS file
-print 'Downloading StackExchange sites RSS file...',
-urllib.urlretrieve('http://stackexchange.com/feeds/sites', sites_file_path)
-print 'done.'
+print('Downloading StackExchange sites XML file...')
+# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
+print('done.')
 
-print ''
+print('')
 
-# parse sites RSS file and download logos
-logos_dir_path = os.path.join(script_dir, '../../../media/images/logos')
+# parse sites XML file and download logos
+images_dir_path = os.path.join(script_dir, '../../../media/images')
+print(os.listdir(images_dir_path))
+logos_dir_path = os.path.join(images_dir_path, 'logos48')
 if not os.path.exists(logos_dir_path):
     os.mkdir(logos_dir_path)
+icons_dir_path = os.path.join(images_dir_path, 'icons')
+if not os.path.exists(icons_dir_path):
+    os.mkdir(icons_dir_path)
+badges_dir_path = os.path.join(images_dir_path, 'badges')
+if not os.path.exists(badges_dir_path):
+    os.mkdir(badges_dir_path)
 
-with open(sites_file_path) as f:
+with open(sites_path) as f:
     sites_file = ElementTree.parse(f)
-    entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
-
-    for entry in entries:
-        entry_title = entry.find('{http://www.w3.org/2005/Atom}title').text.encode('ascii', 'ignore')
-        # extract the key from the url - remove the http:// and .com
-        site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
-        if site_key.startswith('http://'):
-            site_key = site_key[len('http://'):]
-        if site_key.endswith('.com'):
-            site_key = site_key[:-len('.com')]
-        if site_key.endswith('.stackexchange'):
-            site_key = site_key[:-len('.stackexchange')]
-
-        print 'Downloading logo for %s...' % entry_title,
-        urllib.urlretrieve('http://sstatic.net/%s/img/icon-48.png' % site_key, os.path.join(logos_dir_path, '%s.png' % site_key))
-        print 'done.'
+    sites = sites_file.findall('row')
+    # print(rows[0].attrib)
+
+    for site in sites:
+        site_title = site.attrib['LongName']
+        site_name = site.attrib['Name']
+        site_key = site.attrib['TinyName']
+        site_url = site.attrib['Url'][8:]
+        logo_url = site.attrib['ImageUrl']
+        icon_url = site.attrib['IconUrl']
+        badge_url = site.attrib['BadgeIconUrl']
+
+        site_vars = (site_url, site_key, site_name, site_title)
+        # print(site_vars)
+        printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)
+        try:
+            logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
+            if not os.path.exists(logo_file):
+                print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
+        except Exception as e:
+            print('Failed to download logo for %s...' % site_title, str(e))
+
+        try:
+            icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
+            if not os.path.exists(icon_path):
+                print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
+        except:
+            print('Failed to download icon for %s...' % site_title, icon_url)
+
+        try:
+            badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
+            if not os.path.exists(badge_file):
+                print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
+        except:
+            printf('Failed to download badge for %s...\n' % site_title)
+
+        site_files = []
+        print('Key: ' + site_url)
+        for root, dirs, files in os.walk(se_dir):
+            for name in files:
+                if fnmatch.fnmatch(name, site_url + '*'):
+                    print('Match: ' + os.path.join(root, name))
+                    site_files.append(os.path.join(root, name))
+
+        sites_data = sites_file_path
+        for site_file in site_files:
+            dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + 'xml'
+            os.makedirs(dst, exist_ok=True)
+            os.chdir(dst)
+            os.system('tar xzf ' + site_file)
+            print('Data: ' + site_file)
+
+def prepare_site(xml_root, dump_date, site_key):
+    print('Using the XML root path: ' + xml_root + '\n')
+
+    if not os.path.exists(xml_root):
+        print('The given XML root path does not exist.')
+        sys.exit(1)
+
+    # connect to the database
+    print('Connecting to the Stackdump database...')
+    conn_str = settings.DATABASE_CONN_STR
+    sqlhub.processConnection = connectionForURI(conn_str)
+    print('Connected.\n')
+
+# MAIN METHOD
+if __name__ == '__main__':
+    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
+    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
+    parser.add_option('-c', '--dump-date', help='Dump date of the site.')
+
+    (cmd_options, cmd_args) = parser.parse_args()
+
+    if len(cmd_args) < 1:
+        print('The path to the directory containing the extracted XML files is required.')
+        sys.exit(1)
+
+    prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
```
python/src/stackdump/commands/import_recent.py (new file, 911 lines)

File diff suppressed because it is too large.
```diff
@@ -1,16 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 # This script takes extracted site files and inserts them into the database.
 
-from __future__ import with_statement
-
 import sys
 import os
 import time
 import xml.sax
 from datetime import datetime
 import re
-import urllib2
+import urllib
 import socket
 import tempfile
 import traceback
@@ -25,13 +23,7 @@ from pysolr import Solr, SolrError
 
 from stackdump.models import Site, Badge, User
 from stackdump import settings
 
-try:
-    # For Python < 2.6 or people using a newer version of simplejson
-    import simplejson as json
-except ImportError:
-    # For Python >= 2.6
-    import json
+import json
 
 script_dir = os.path.dirname(sys.argv[0])
```
```diff
@@ -79,7 +71,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
 
             self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
 
-        except Exception, e:
+        except Exception as e:
             # could not insert this, so ignore the row
             print('Exception: ' + str(e))
             import traceback
@@ -112,7 +104,7 @@ class BadgeContentHandler(BaseContentHandler):
             d['userId'] = int(attrs.get('UserId', 0))
             d['name'] = attrs.get('Name', '')
             d['date'] = attrs.get('Date')
-        except Exception, e:
+        except Exception as e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
             print('Exception: ' + str(e))
@@ -146,7 +138,7 @@ class CommentContentHandler(BaseContentHandler):
             d['creationDate'] = attrs.get('CreationDate')
             d['userId'] = int(attrs.get('UserId', 0))
 
-        except Exception, e:
+        except Exception as e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
             print('Exception: ' + str(e))
@@ -196,7 +188,7 @@ class UserContentHandler(BaseContentHandler):
             d['upVotes'] = int(attrs.get('UpVotes', 0))
             d['downVotes'] = int(attrs.get('DownVotes', 0))
 
-        except Exception, e:
+        except Exception as e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
             print('Exception: ' + str(e))
@@ -243,7 +235,7 @@ class PostContentHandler(xml.sax.ContentHandler):
         if hasattr(obj, 'isoformat'):
             return obj.isoformat()
         else:
-            raise TypeError, 'Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj))
+            raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))
 
     def startElement(self, name, attrs):
         if name != 'row':
@@ -300,7 +292,7 @@ class PostContentHandler(xml.sax.ContentHandler):
             d['comments'] = [ ]
 
 
-        except Exception, e:
+        except Exception as e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
             print('Exception: ' + str(e))
@@ -346,7 +338,7 @@ class PostContentHandler(xml.sax.ContentHandler):
                 # remove orphan answers from the orphan list
                 del self.orphan_answers[d['id']]
 
-        except Exception, e:
+        except Exception as e:
             # could not insert this, so ignore the row
             print('Exception: ' + str(e))
             import traceback
@@ -376,7 +368,7 @@ class PostContentHandler(xml.sax.ContentHandler):
                     # question is complete, store it.
                     questions_to_commit.append(self.finalise_question(q))
 
-            except Exception, e:
+            except Exception as e:
                 # could not serialise and insert this question, so ignore it
                 print('Exception: ' + str(e))
                 import traceback
```
```diff
@@ -507,7 +499,7 @@ class PostContentHandler(xml.sax.ContentHandler):
             if q['acceptedAnswerId'] in post_ids:
                 question_obj['acceptedAnswerId'] = q['acceptedAnswerId']
             else:
-                print 'Question [ID# %i] had an unknown answer. Possibly been merged or migrated. Ignoring inconsistency.' % (q['id'], )
+                print('Question [ID# %i] had an unknown answer. Possibly been merged or migrated. Ignoring inconsistency.' % (q['id'], ))
             question_obj['creationDate'] = q['creationDate']
             question_obj['score'] = q['score']
             question_obj['viewCount'] = q['viewCount']
```
```diff
@@ -538,13 +530,17 @@ class PostContentHandler(xml.sax.ContentHandler):
 
         By default, they are committed immediately. Set the ``commit`` argument
         to False to disable this behaviour.
+
+        This function will loop if a SolrError is encountered to allow the user
+        to retry the commit without having to start again from the beginning,
+        e.g. if the Solr instance stops responding.
         """
         while True:
             try:
                 self.solr.add(questions, commit=commit)
                 break
-            except SolrError, e:
-                print('An exception occurred while committing questions - ')
+            except SolrError:
+                print('A Solr error occurred while committing questions - ')
                 traceback.print_exc(file=sys.stdout)
                 print('')
                 while True:
@@ -557,6 +553,8 @@ class PostContentHandler(xml.sax.ContentHandler):
                         break
                     else:
                         raise
+            except:
+                raise
 
     def commit_all_questions(self):
         """
```
```diff
@@ -574,7 +572,7 @@ class PostContentHandler(xml.sax.ContentHandler):
                 # question is complete, store it.
                 questions_to_commit.append(self.finalise_question(q))
 
-            except Exception, e:
+            except Exception as e:
                 # could not serialise and insert this question, so ignore it
                 print('Exception: ' + str(e))
                 import traceback
```
```diff
@@ -643,8 +641,7 @@ def get_file_path(dir_path, filename):
     return os.path.abspath(os.path.join(dir_path, matches[0]))
 
 
-def import_site(xml_root, site_name, dump_date, site_desc, site_key,
-                site_base_url, answer_yes=False):
+def import_site(xml_root, dump_date, import_key):
     print('Using the XML root path: ' + xml_root + '\n')
 
     if not os.path.exists(xml_root):
@@ -657,18 +654,6 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
     sqlhub.processConnection = connectionForURI(conn_str)
     print('Connected.\n')
 
-    # connect to solr
-    print('Connecting to solr...')
-    solr = Solr(settings.SOLR_URL, assume_clean=True)
-    # pysolr doesn't try to connect until a request is made, so we'll make a ping request
-    try:
-        solr._send_request('GET', 'admin/ping')
-    except socket.error, e:
-        print('Failed to connect to solr - error was: %s' % str(e))
-        print('Aborting.')
-        sys.exit(2)
-    print('Connected.\n')
-
     # ensure required tables exist
     print("Creating tables if they don't exist...")
     Site.createTable(ifNotExists=True)
```
```diff
@@ -677,117 +662,30 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
     print('Created.\n')
 
     # SITE INFO
-    # only look if they were not specified at the command line; also only if
-    # readme.txt exists (they don't in dumps after Aug 2012)
-    readme_path = get_file_path(xml_root, 'readme.txt')
-    if not (site_name and dump_date) and readme_path:
-        # get the site name from the first line of readme.txt. This could be fragile.
-        with open(readme_path, 'r') as f:
-            site_readme_desc = f.readline().strip()
-
-        # assume if there's a colon in the name, the name part is before, and the date
-        # part is after.
-        if ':' in site_readme_desc:
-            readme_site_name, readme_dump_date = site_readme_desc.split(':')
-            readme_site_name = readme_site_name.strip()
-            readme_dump_date = readme_dump_date.strip()
-        else:
-            readme_site_name = site_readme_desc
-            readme_dump_date = None
-
-        # if the phrase ' - Data Dump' is in the readme site name, remove it
-        i = readme_site_name.rfind(' - Data Dump')
-        if i >= 0:
-            readme_site_name = readme_site_name[:i].strip()
-
-        if not site_name:
-            site_name = readme_site_name
-        if not dump_date:
-            dump_date = readme_dump_date
-
-    # look for the site in the sites RSS file using the base_url with the id in RSS
-    # scrub the URL scheme off the base_url
-    if site_base_url:
-        # if there is no URL scheme, add one so it can be parsed by urllib2 so it
-        # can strip off other bits in the URL that we don't want
-        if '://' not in site_base_url:
-            site_base_url = 'http://%s' % site_base_url
-        site_base_url = urllib2.Request(site_base_url).get_host()
-
-    # attempt to get more information from the sites RSS cache
-    if site_base_url and not (site_name and site_desc and site_key):
-        sites_file_path = os.path.join(script_dir, '../../../../data/sites')
-        if os.path.exists(sites_file_path):
-            with open(sites_file_path) as f:
-                sites_file = ElementTree.parse(f)
-                entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
-
-                for entry in entries:
-                    entry_base_url = entry.find('{http://www.w3.org/2005/Atom}id').text
-                    if '://' in entry_base_url:
-                        entry_base_url = urllib2.Request(entry_base_url).get_host()
-                    if site_base_url == entry_base_url:
-                        # this entry matches the detected site id
-                        if not site_key:
-                            # extract the key from the url
-                            rss_site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
-                            # remove the URL scheme
-                            if '://' in rss_site_key:
-                                rss_site_key = rss_site_key[rss_site_key.find('://')+3:]
-                            # remove the TLD
-                            if rss_site_key.rfind('.') >= 0:
-                                rss_site_key = rss_site_key[:rss_site_key.rfind('.')]
-                            # remove the .stackexchange bit
-                            if '.stackexchange' in rss_site_key:
-                                rss_site_key = rss_site_key[:rss_site_key.find('.stackexchange')]
-
-                            site_key = rss_site_key
-
-                        if not site_name:
-                            site_name = entry.find('{http://www.w3.org/2005/Atom}title').text.strip()
-                        if not site_desc:
-                            site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()
-
-    print 'Name: %s\nKey: %s\nDescription: %s\nDump Date: %s\nBase URL: %s\n' % (
-        site_name.encode('ascii', 'ignore') if site_name else None,
-        site_key,
-        site_desc.encode('ascii', 'ignore') if site_desc else None,
-        dump_date,
-        site_base_url
-    )
-
-    # the base URL is optional.
-    if not (site_name and site_key and site_desc and dump_date):
-        print 'Could not get all the details for the site.'
-        print 'Use command-line parameters to specify the missing details (listed as None).'
-        sys.exit(1)
-
-    # prevent importing sites with keys that clash with method names in the app,
-    # e.g. a site key of 'search' would clash with the Stackdump-wide search page.
-    if site_key in ('search', 'import', 'media', 'licenses'):
-        print 'The site key given, %s, is a reserved word in Stackdump.' % site_key
-        print 'Use the --site-key parameter to specify an alternate site key.'
-        sys.exit(2)
-
-    # confirm site details with user to make sure we don't accidentally overwrite
-    # another site.
-    if not answer_yes:
-        confirm_prompt = 'Are these details correct (answer "yes" to proceed, anything else to abort)? '
-        confirm_answer = raw_input(confirm_prompt)
-        if confirm_answer != 'yes':
-            print 'Import aborted on user request.'
-            sys.exit(3)
-
-    # rollback any uncommitted entries in solr. Uncommitted entries may occur if
-    # this import process is aborted. Solr doesn't have the concept of transactions
-    # like databases do, so without a rollback, we'll be committing the previously
-    # uncommitted entries plus the newly imported ones.
-    #
-    # This also means multiple dataproc processes cannot occur concurrently. If you
-    # do, the import will be silently incomplete.
-    print('Clearing any uncommitted entries in solr...')
-    solr._update('<rollback />', waitFlush=None, waitSearcher=None)
-    print('Cleared.\n')
+    # only look if they were not specified at the command line;
+    se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
+    sites_path = os.path.join(se_dir, 'Sites.xml')
+
+    with open(sites_path) as f:
+        sites_file = ElementTree.parse(f)
+        sites = sites_file.findall('row')
+        # print(rows[0].attrib)
+
+        for site in sites:
+            site_title = site.attrib['LongName']
+            site_name = site.attrib['Name']
+            # extract the key from the url - remove the http:// and .com
+            site_key = site.attrib['TinyName']
+            site_url = site.attrib['Url'][8:]
+            logo_url = site.attrib['ImageUrl']
+            icon_url = site.attrib['IconUrl']
+            badge_url = site.attrib['BadgeIconUrl']
+
+            if (import_key != '') and (import_key != site_key):
+                continue
+            else:
+                print('site_name: ' + site_name)
 
     # check if site is already in database; if so, purge the data.
     site = list(Site.select(Site.q.key==site_key))
```
```diff
@@ -903,19 +801,12 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
 # MAIN METHOD
 if __name__ == '__main__':
     parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
-    parser.add_option('-n', '--site-name', help='Name of the site.')
-    parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
     parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
     parser.add_option('-c', '--dump-date', help='Dump date of the site.')
-    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
-    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
 
     (cmd_options, cmd_args) = parser.parse_args()
 
     if len(cmd_args) < 1:
         print('The path to the directory containing the extracted XML files is required.')
         sys.exit(1)
 
-    import_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
-                cmd_options.site_desc, cmd_options.site_key,
-                cmd_options.base_url, answer_yes=cmd_options.answer_yes)
+    import_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
```
python/src/stackdump/commands/settings.py (symbolic link, new file, 1 line)

```diff
@@ -0,0 +1 @@
+../settings.py
```
questions (1).json (new file, 1 line)

File diff suppressed because one or more lines are too long.

questions.json (new file, 1 line)

File diff suppressed because one or more lines are too long.
schema.xlsx (binary, new file)

Binary file not shown.
start_python3.sh (executable, new file, 54 lines)

```diff
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+##
+# This script attempts to find a version of Python on the system PATH, and
+# checks that it is 3.5+.
+#
+# An alternate Python command can be specified in a file named PYTHON_CMD in this
+# script's directory. This path will override any lookup on the system PATH.
+##
+
+# FUNCTIONS
+function checkPythonVersion {
+    if [ ! -z "$1" ]
+    then
+        PYTHON_VER_MAJOR=`echo $1 | cut -d "." -f 1`
+        PYTHON_VER_MINOR=`echo $1 | cut -d "." -f 2`
+
+        if [ $PYTHON_VER_MAJOR -eq "3" -a $PYTHON_VER_MINOR -ge "5" ]
+        then
+            return 1
+        fi
+    fi
+
+    return 0
+}
+
+# MAIN
+SCRIPT_DIR=`dirname $0`
+PYTHON_CMD=python3
+
+# if there is a PYTHON_CMD file in the script directory, use that instead
+if [ -e "$SCRIPT_DIR/PYTHON_CMD" ]
+then
+    PYTHON_CMD=`cat "$SCRIPT_DIR/PYTHON_CMD"`
+fi
+
+if [ ! -z "`which "$PYTHON_CMD" 2>/dev/null`" ]
+then
+    # check if Python is the right version
+    PYTHON_VER=`"$PYTHON_CMD" -V 2>&1 | cut -d " " -f 2`
+    checkPythonVersion "$PYTHON_VER"
+    if [ $? == 1 ]
+    then
+        echo "Using Python `which "$PYTHON_CMD"`"
+
+        # execution ends here if Python is found
+        PYTHONPATH=$SCRIPT_DIR/pyth3/packages:$SCRIPT_DIR/python/src:$PYTHONPATH
+        env "PYTHONPATH=$PYTHONPATH" "$PYTHON_CMD" "$@"
+        exit $?
+    fi
+fi
+
+# if we get here, it means the right version of Python was not found
+echo 'No suitable version of Python was found. Python 3.5 or later is required.'
```