mirror of https://github.com/djohnlewis/stackdump synced 2025-12-17 13:23:25 +00:00

34 Commits
v1.2 ... master

Author SHA1 Message Date
djohnlewis
ea8aefcaf7 xml output 2021-06-13 15:23:02 +01:00
djohnlewis
a10a6d1e4d Python update 2021-06-13 15:19:51 +01:00
djohnlewis
4616d04c56 Python update 2021-06-10 09:46:47 +01:00
djohnlewis
f53efd1422 image download 2021-05-24 20:14:14 +01:00
djohnlewis
20693a8764 added gitigore 2021-05-24 09:15:01 +01:00
Sornram Kmut'nb
f20e281d3d Update README.textile 2016-07-16 23:07:49 +07:00
Sornram Kmut'nb
0f16cd4bce Update README.textile 2016-07-16 23:07:14 +07:00
Sornram Kmut'nb
2d27d3efe4 Update README.textile 2016-07-16 23:05:35 +07:00
Sornram Kmut'nb
3f29d45964 Update README.textile 2016-07-16 23:03:24 +07:00
Skylar Ittner
dcc7203c97 Updated icon retrieval URL to fix 404 errors 2016-04-21 01:23:32 +00:00
Sam Lai
6a7b8ea432 Updated the README to reflect the new resource requirements needed for the latest StackOverflow data set. 2015-01-06 22:15:30 +00:00
Samuel Lai
c020660479 Bad URLs are left as bad external links, rather than re-written as bad
internal links.

This doesn't make much difference, but I think it is nicer to assume bad
URLs are external, rather than internal.
2014-05-17 19:34:16 +10:00
Alexei Baboulevitch
7f6ed7b438 Fixed an uncaught exception caused by broken URLs.
Examples of fixed pages: photo/11689, stackoverflow/315911
2014-05-12 14:35:28 +02:00
Samuel Lai
a06d2a4c55 Added minimum version information in more places in README. 2014-04-26 23:18:01 +10:00
Samuel Lai
55bec19665 Updated README with ideal Python version information. 2014-04-26 23:09:45 +10:00
Samuel Lai
a59e3b59d0 Updated readme with a few minor changes. 2014-04-25 23:51:42 +10:00
Samuel Lai
40121f2600 Added tag v1.3.1 for changeset 321f5e2fa176 2014-03-04 16:28:44 +11:00
Samuel Lai
db026d2ccc Fixed Supervisor config file example so it actually stops the components properly. 2014-03-04 16:19:30 +11:00
Samuel Lai
ae6e10e6c4 Fixed a bug where import_site will loop forever if a non-SolrError exception is encountered. 2014-03-04 15:47:11 +11:00
Samuel Lai
f79df598d3 Added tag v1.3 for changeset 5c1ae2e2f71a 2014-03-04 15:06:01 +11:00
Samuel Lai
4d6343584a Minor README tweaks. 2014-03-03 17:07:26 +11:00
Samuel Lai
9d1d6b135a Grrr. More textile issues. 2014-02-27 22:02:04 +11:00
Samuel Lai
96b06f7b35 Oops, textile syntax mistake. 2014-02-27 22:00:48 +11:00
Samuel Lai
28d79ea089 Added notes on using supervisor with stackdump. 2014-02-27 21:58:22 +11:00
Samuel Lai
ce7edf1ca0 Minor README tweaks. 2014-02-27 20:44:55 +11:00
Samuel Lai
4254f31859 Updated the README for the next release.
Fixes #8 by updating the URL to the data dumps.
2014-02-27 20:39:32 +11:00
Samuel Lai
c11fcfacf6 Fixes #9. Added ability for import_site command to resume importing if the connection to Solr is lost and restored. 2014-02-27 20:12:53 +11:00
Samuel Lai
7764f088c2 Added a setting to disable the rewriting of links and image URLs. 2014-02-27 18:52:25 +11:00
Samuel Lai
a4c6c2c7ba Certain ignored post type IDs are now recognised by the error handler and messages printed as such. 2014-02-27 18:13:04 +11:00
Samuel Lai
01f9b10c27 Fixed #7. Turns out post IDs are not unique across sites.
This change will require re-indexing of all sites unfortunately. On the upside, more questions to browse!
2014-02-27 17:57:34 +11:00
Sam
cdb93e6f68 Merged changes. 2014-02-16 01:04:19 +11:00
Sam
0990e00852 Added an original copy of pysolr.py so the custom changes can be worked out. 2014-02-16 01:03:05 +11:00
Samuel Lai
92e359174a Added some notes on importing StackOverflow on Windows. 2013-12-12 17:29:55 +11:00
Samuel Lai
c521fc1627 Added tag v1.2 for changeset 240affa260a1 2013-11-30 18:06:37 +11:00
22 changed files with 2502 additions and 323 deletions

31
.gitignore vendored Normal file

@@ -0,0 +1,31 @@
JAVA_CMD
PYTHON_CMD
.DS_Store
# ignore any data
data/
# ignore working bytecode
*.class
*.pyc
datadump/
# ignore test and tutorial directories
test/
tests/
testsuite/
tutorial/
# Solr/Jetty
java/solr/server/solr-webapp/
java/solr/server/logs/
# ignore the downloaded logos
python/media/images/logos48/
python/media/images/icons/
python/media/images/badges/
# PyCharm project files
.idea/

3
.idea/.gitignore generated vendored Normal file

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml


@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

8
.idea/modules.xml generated Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/stackdump.iml" filepath="$PROJECT_DIR$/.idea/stackdump.iml" />
</modules>
</component>
</project>

8
.idea/stackdump.iml generated Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml generated Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>


@@ -1,3 +1,8 @@
Tutorial (Thai language) : http://www.youtube.com/watch?v=jHEBlHxEKeM
"http://blog.sornram9254.com/stackdump-an-offline-browser-for-stackexchange-stackverflow/":http://blog.sornram9254.com/stackdump-an-offline-browser-for-stackexchange-stackverflow/
- - - - - - - - - - - - - - - - - - - ORIGINAL README - - - - - - - - - - - - - - - - - - -
h1. Stackdump - an offline browser for StackExchange sites.
Stackdump was conceived for those who work in environments that do not have easy access to the StackExchange family of websites. It allows you to host a read-only instance of the StackExchange sites locally, accessible via a web browser.
@@ -12,13 +17,15 @@ h2. Screenshots
h2. System Requirements
Stackdump was written in Python and requires Python 2.5 or later (but not Python 3). It leverages Apache Solr, which requires the Java runtime (JRE), version 6 or later.
Stackdump was written in Python and requires Python 2.5 or later (but not Python 3). There have been some reported issues with older versions of Python, so the ideal version to run is v2.7.6 (the latest 2.x version, as of writing). Stackdump also leverages Apache Solr, which requires the Java runtime (JRE), version 6 or later.
Besides that, there are no OS-specific dependencies, and Stackdump should work on any platform that Python and Java run on (although it only comes bundled with Linux scripts at the moment). It was, however, developed and tested on CentOS 5 running Python 2.7 and JRE 6 update 27.
You will also need "7-zip":http://www.7-zip.org/ to extract the data dump files, but Stackdump does not use it directly so you can perform the extraction on another machine first.
It is recommended that Stackdump be run on a system with at least 3GB of RAM, particularly if you intend to import StackOverflow into Stackdump. Apache Solr requires a fair bit of memory during the import process. It should also have a fair bit of space available; having at least roughly the space used by the raw, extracted, data dump XML files is a good rule of thumb (note that once imported, the raw data dump XML files are not needed by Stackdump any more).
The amount of memory required for Stackdump depends on which dataset you want to import. For most datasets, at least 3GB of RAM is preferable. If you want to import StackOverflow, you must use a 64-bit operating system and a 64-bit version of Python, and also have at least 6GB of RAM available (or swap). If you do not have enough RAM available, the import process will likely fail with a _MemoryError_ message at some point.
Make sure you have enough disk space too - having at least roughly the space used by the raw, extracted, data dump XML files available is a good rule of thumb (note that once imported, the raw data dump XML files are not needed by Stackdump any more).
Finally, Stackdump has been tested and works in the latest browsers (IE9, FF10+, Chrome, Safari). It degrades fairly gracefully in older browsers, although some will have rendering issues, e.g. IE8.
@@ -28,10 +35,16 @@ Version 1.1 fixes a few bugs, the major one being the inability to import the 20
Because changes have been made to the search schema and the search indexer has been upgraded (to Solr 4.5), all data will need to be re-indexed. Therefore there is no upgrade path; follow the instructions below to set up Stackdump again. It is recommended to install this new version in a new directory, instead of overwriting the existing one.
h2. Changes and upgrading from v1.1 to v1.2.
h2. Changes and upgrading from v1.1 to v1.2
The major changes in the v1.2 release are improvements to the speed of importing data. There are some other smaller changes, including new PowerShell scripts to start and manage Stackdump on Windows as well as a few bug fixes when running on Windows. The search indexing side of things has not changed, therefore data imported using v1.1 will continue to work in v1.2. _Data from older versions however, needs to be re-indexed. See the above section on upgrading to v1.1 for more details._
h2. Changes and upgrading from v1.2 to v1.3
v1.3 is primarily a bugfix release, for a fairly serious bug. It turns out Stackdump had been subtly overwriting questions as more sites were imported, because it assumed post IDs were unique across all sites when in fact they were not. This meant that as more sites were imported, previously imported sites started to lose questions. The fix required a change to the search index, therefore *the data directory will need to be deleted and all data will need to be re-imported after installing this version*. Thanks to @yammesicka for reporting the issue.
Other changes include a new setting to allow disabling the link and image URL rewriting, and a change to the @import_site@ command so it doesn't bail immediately if there is a Solr connection issue - it will prompt and allow resumption after the connection issue has been resolved.
h3. Importing the StackOverflow data dump, September 2013
The StackOverflow data dump has grown significantly since I started this project back in 2011. With the improvements in v1.2, on a VM with two cores and 4GB of RAM running CentOS 5.7 on a single, standard hard drive containing spinning pieces of metal,
@@ -43,19 +56,27 @@ The StackOverflow data dump has grown significantly since I started this project
In total, the StackOverflow data dump has *15,933,529 posts* (questions and answers), *2,332,403 users* and a very large number of comments.
I attempted this on a similarly spec'ed Windows 7 64-bit VM as well - 23 hours later it was still trying to process the comments. SQLite, Python or just plain disk performance is very poor on Windows for some reason. Therefore, if you intend on importing StackOverflow, I would advise you to run Stackdump on Linux instead. The smaller sites all complete within a reasonable time though, and as far as I'm aware there are no perceptible performance issues on Windows.
h3. Reports on importing the StackOverflow data dump, September 2014
Due to the growth of the dataset, the import process now requires at least 6GB of RAM. This also means you must use a 64-bit operating system and a 64-bit version of Python.
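If you are unsure whether a Python build is 64-bit, a quick check is @python -c "import struct; print(struct.calcsize('P') * 8)"@ - it prints 64 on a 64-bit build and 32 on a 32-bit one.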
h2. Setting up
Stackdump was designed for offline environments or environments with poor internet access, therefore it is bundled with all the dependencies it requires (with the exception of Python, Java and 7-zip).
As long as you have:
* "Python":http://python.org/download/,
* "Java":http://java.com/en/download/manual.jsp,
* "Python":http://python.org/download/, version 2.5 or later but not version 3 (tested with v2.7.6),
* "Java":http://java.com/en/download/manual.jsp, version 6 (1.6) or later,
* "Stackdump":https://bitbucket.org/samuel.lai/stackdump/downloads,
* the "StackExchange Data Dump":http://www.clearbits.net/creators/146-stack-exchange-data-dump (Note: this is only available as a torrent), and
* the "StackExchange Data Dump":https://archive.org/details/stackexchange (download the sites you wish to import - note that StackOverflow is split into 7 archive files; only Comments, Posts and Users are required but after extraction the files need to be renamed to Comments.xml, Posts.xml and Users.xml respectively), and
* "7-zip":http://www.7-zip.org/ (needed to extract the data dump files)
...you should be able to get an instance up and running.
If you are using a 64-bit operating system, get the 64-bit version of Python.
To provide a better experience, Stackdump can use the RSS feed content to pre-fill some of the required details during the import process, as well as to display the site logos in the app. Stackdump comes bundled with a script that downloads and places these bits in the right places. If you're in a completely offline environment however, it may be worth running this script on a connected box first.
h3. Windows users
@@ -66,7 +87,7 @@ Remember to set your PowerShell execution policy to at least @RemoteSigned@ firs
h3. Extract Stackdump
Stackdump was to be self-contained, so to get it up and running, simply extract the Stackdump download to an appropriate location.
Stackdump was designed to be self-contained, so to get it up and running, simply extract the Stackdump download archive to an appropriate location.
h3. Verify dependencies
@@ -79,7 +100,7 @@ bq. If you're using Java 7 on Linux and you see an error similar to the followin
this is because you have SELinux enabled. You will need to tell SELinux to allow Java to run by using the following command as root (amending the path as necessary) -
@chcon -t textrel_shlib_t /opt/jre1.7.0_40/lib/i386/server/libjvm.so@
Then type @python -V@ and check that it is version 2.5 or later (and not Python 3).
Then type @python -V@ and check that it is version 2.5 or later (and not Python 3). Ideally this should be v2.7.6 or later as there have been some reported issues with earlier versions.
If you would rather not put these versions in the PATH (e.g. you don't want to override the default version of Python in your Linux distribution), you can tell Stackdump which Java and/or Python to use explicitly by creating a file named @JAVA_CMD@ or @PYTHON_CMD@ respectively in the Stackdump root directory, and placing the path to the executable in there.
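For example, assuming the interpreter you want is at @/usr/local/bin/python2.7@ (an illustrative path), the following records it without touching the PATH - @echo /usr/local/bin/python2.7 > stackdump_dir/PYTHON_CMD@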
@@ -108,15 +129,15 @@ To start the import process, execute the following command -
@stackdump_dir/manage.sh import_site --base-url site_url --dump-date dump_date path_to_xml_files@
... where site_url is the URL of the site you're importing, e.g. __android.stackexchange.com__; dump_date is the date of the data dump you're importing, e.g. __August 2012__, and finally path_to_xml_files is the path to the XML files you just extracted. The dump_date is a text string that is shown in the app only, so it can be in any format you want.
... where @site_url@ is the URL of the site you're importing, e.g. __android.stackexchange.com__; @dump_date@ is the date of the data dump you're importing, e.g. __August 2012__, and finally @path_to_xml_files@ is the path to the directory containing the XML files that were just extracted. The @dump_date@ is a text string that is shown in the app only, so it can be in any format you want.
For example, to import the August 2012 data dump of the Android StackExchange site, you would execute -
For example, to import the August 2012 data dump of the Android StackExchange site, with the files extracted into @/tmp/android@, you would execute -
@stackdump_dir/manage.sh import_site --base-url android.stackexchange.com --dump-date "August 2012" /tmp/android@
It is normal to get messages about unknown PostTypeIds and missing comments and answers. These errors are likely due to those posts being hidden via moderation.
This can take anywhere between a minute to 10 hours or more depending on the site you're importing. As a rough guide, __android.stackexchange.com__ took a minute on my VM, while __stackoverflow.com__ took just over 10 hours.
This can take anywhere between a minute to 20 hours or more depending on the site you're importing. As a rough guide, __android.stackexchange.com__ took a minute on my VM, while __stackoverflow.com__ took just under 24 hours.
Repeat these steps for each site you wish to import. Do not attempt to import multiple sites at the same time; it will not work and you may end up with half-imported sites.
@@ -130,19 +151,53 @@ To start Stackdump, execute the following command -
... and visit port 8080 on that machine. That's it - your own offline, read-only instance of StackExchange.
If you need to change the port that it runs on, modify @stackdump_dir/python/src/stackdump/settings.py@ and restart the app.
If you need to change the port that it runs on, or modify other settings that control how Stackdump works, see the 'Optional configuration' section below for more details.
The aforementioned @settings.py@ file also contains some other settings that control how Stackdump works.
Both the search indexer and the app need to be running for Stackdump to work.
h2. Optional configuration
There are a few settings for those who like to tweak. There's no need to adjust them normally though; the default settings should be fine.
The settings file is located in @stackdump_dir/python/src/stackdump/settings.py@. The web component will need to be restarted after changes have been made for them to take effect.
* *SERVER_HOST* - the network interface to run the Stackdump web app on. Use _'0.0.0.0'_ for all interfaces, or _'127.0.0.1'_ for localhost only. By default, it runs on all interfaces.
* *SERVER_PORT* - the port to run the Stackdump web app on. The default port is _8080_.
* *SOLR_URL* - the URL to the Solr instance. The default assumes Solr is running on the same system. Change this if Solr is running on a different system.
* *NUM_OF_DEFAULT_COMMENTS* - the number of comments shown by default for questions and answers before the remaining comments are hidden (and shown when clicked). The default is _3_ comments.
* *NUM_OF_RANDOM_QUESTIONS* - the number of random questions shown on the home page of Stackdump and the site pages. The default is _3_ questions.
* *REWRITE_LINKS_AND_IMAGES* - by default, all links are rewritten to either point internally or be marked as an external link, and image URLs are rewritten to point to a placeholder image. Set this setting to _False_ to disable this behaviour.
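For instance, a minimal @settings.py@ override using only the settings listed above (the values here are purely illustrative) might look like -

bc.. from default_settings import *

# only listen on localhost and serve on a different port
SERVER_HOST = '127.0.0.1'
SERVER_PORT = 8888

# show more comments before collapsing, and leave links and image URLs untouched
NUM_OF_DEFAULT_COMMENTS = 5
REWRITE_LINKS_AND_IMAGES = False

p. Restart the web component after saving for the changes to take effect.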
h2. Running Stackdump as a service
Stackdump comes bundled with some init.d scripts as well which were tested on CentOS 5. These are located in the @init.d@ directory. To use these, you will need to modify them to specify the path to the Stackdump root directory and the user to run under.
Both the search indexer and the app need to be running for Stackdump to work.
Another option is to use "Supervisor":http://supervisord.org/ with a simple configuration file, e.g.,
bc.. [program:stackdump-solr]
command=/path/to/stackdump/start_solr.sh
priority=900
user=stackdump_user
stopasgroup=true
stdout_logfile=/path/to/stackdump/solr_stdout.log
stderr_logfile=/path/to/stackdump/solr_stderr.log
[program:stackdump-web]
command=/path/to/stackdump/start_web.sh
user=stackdump_user
stopasgroup=true
stdout_logfile=/path/to/stackdump/web_stdout.log
stderr_logfile=/path/to/stackdump/web_stderr.log
p. Supervisor v3.0b1 or later is required, due to the _stopasgroup_ parameter. Without this parameter, Supervisor will not be able to stop the Stackdump components properly as they're being executed from a script.
Yet another option for those using newer Linux distributions is to create native "systemd service definitions":http://www.freedesktop.org/software/systemd/man/systemd.service.html of type _simple_ for each of the components.
h2. Maintenance
Stackdump stores all its data in the @data@ directory under its root directory. If you want to start fresh, just stop the app and the search indexer, delete that directory and restart the app and search indexer.
To delete certain sites from Stackdump, use the manage_sites management command -
To delete certain sites from Stackdump, use the @manage_sites@ management command -
@stackdump_dir/manage.sh manage_sites -l@ to list the sites (and their site keys) currently in the system;
@stackdump_dir/manage.sh manage_sites -d site_key@ to delete a particular site.
@@ -165,6 +220,7 @@ Stackdump leverages several open-source projects to do various things, including
* "markdown":http://pypi.python.org/pypi/Markdown for rendering comments
* "mathjax":http://www.mathjax.org/ for displaying mathematical expressions properly
* "httplib2":http://code.google.com/p/httplib2/ as a dependency of pysolr
* "requests":https://github.com/kennethreitz/requests/ as a dependency of pysolr
* "Apache Solr":http://lucene.apache.org/solr/ for search functionality
h2. Things not supported... yet


@@ -110,6 +110,10 @@
<!-- we'll get the values out of the JSON, so most fields are not stored -->
<!-- fields are listed here so searches can be performed against them -->
<!-- this is used by Lucene to uniquely identify a post across all sites.
It is of the form "siteKey-id" and is necessary because post IDs are
reused across sites. -->
<field name="documentId" type="string" indexed="true" stored="true" required="true" />
<!-- the ID field needs to be a string for the QueryElevationComponent -->
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="siteKey" type="string" indexed="true" stored="true" required="true" />
@@ -196,7 +200,7 @@
<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field
-->
<uniqueKey>id</uniqueKey>
<uniqueKey>documentId</uniqueKey>
<!-- DEPRECATED: The defaultSearchField is consulted by various query parsers when
parsing a query string that isn't explicit about the field. Machine (non-user)
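In effect, each indexed question now gets a composite key built from the site key and the post ID. A rough sketch of the idea (the helper name below is hypothetical; the actual assignment appears in the import script further down) -

# post IDs are reused across sites, so the Solr unique key combines both parts
def make_document_id(site_key, post_id):
    return '%s-%s' % (site_key, post_id)

doc['documentId'] = make_document_id(site.key, question['id'])  # e.g. "android-12345"
doc['id'] = str(question['id'])
doc['siteKey'] = site.key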


@@ -35,5 +35,5 @@ else
# shift off the command name so we don't pass it on
shift
$SCRIPT_DIR/start_python.sh $command "$@"
$SCRIPT_DIR/start_python3.sh $command "$@"
fi

File diff suppressed because it is too large


@@ -410,6 +410,7 @@ def view_question(site_key, question_id, answer_id=None):
result = results.docs[0]
convert_comments_to_html(result)
if settings.REWRITE_LINKS_AND_IMAGES:
rewrite_result(result)
sort_answers(result)
context['result'] = result
@@ -826,7 +827,12 @@ def _rewrite_html(html, app_url_root, sites_by_urls):
internal_link = False
url = t.get('href', None)
if url:
try:
host = urllib2.Request(url).get_host()
except ValueError:
# invalid URL or local anchor, leaving as-is
pass
else:
site = sites_by_urls.get(host, None)
if site:
# rewrite this URL for stackdump
@@ -835,18 +841,17 @@ def _rewrite_html(html, app_url_root, sites_by_urls):
question_id = question_id.groupdict()['id']
url = '%s%s/%s' % (app_url_root, site.key, question_id)
t.set('href', url)
t.set('class', t.get('class', '') + ' internal-link')
internal_link = True
answer_id = SE_ANSWER_ID_RE.search(url)
if answer_id:
answer_id = answer_id.groupdict()['id']
url = '%s%s/a/%s' % (app_url_root, site.key, answer_id)
t.set('href', url)
t.set('class', t.get('class', '') + ' internal-link')
internal_link = True
if not internal_link:
if internal_link:
t.set('class', t.get('class', '') + ' internal-link')
else:
t.set('class', t.get('class', '') + ' external-link')
# get a string back
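The net effect of this change, as a simplified sketch (the helper below is hypothetical and ignores the question/answer URL rewriting shown above): a URL that cannot be parsed is now simply left alone and styled as an external link -

import urllib2

def classify_link(url, sites_by_urls):
    # bad URLs and local anchors used to become broken internal links;
    # now they are left as-is and marked external
    try:
        host = urllib2.Request(url).get_host()
    except ValueError:
        return 'external-link'
    return 'internal-link' if host in sites_by_urls else 'external-link'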


@@ -0,0 +1 @@
../default_settings.py


@@ -1,47 +1,142 @@
#!/usr/bin/env python
# This script downloads the sites RSS file and associated logos from the net.
import urllib
import tarfile
import urllib.request
from xml.etree import ElementTree
import os
import sys
def printf(format, *args):
sys.stdout.write(format % args)
from shutil import copy
import os, ssl, fnmatch
from optparse import OptionParser
from xml.etree import ElementTree
import elasticsearch
import settings
from sqlobject import sqlhub, connectionForURI,AND, IN, SQLObject, \
UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
from sqlobject.sqlbuilder import Delete, Insert
from sqlobject.styles import DefaultStyle
from pysolr import Solr, SolrError
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
getattr(ssl, '_create_unverified_context', None)):
ssl._create_default_https_context = ssl._create_unverified_context
se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
sites_path = os.path.join(se_dir, 'Sites.xml')
script_dir = os.path.dirname(sys.argv[0])
sites_file_path = os.path.join(script_dir, '../../../../data/sites')
sites_file_path = os.path.join(script_dir, '../../../../data/')
# ensure the data directory exists
# download the sites RSS file
if not os.path.exists(os.path.dirname(sites_file_path)):
os.mkdir(os.path.dirname(sites_file_path))
# download the sites RSS file
print 'Downloading StackExchange sites RSS file...',
urllib.urlretrieve('http://stackexchange.com/feeds/sites', sites_file_path)
print 'done.'
print('Downloading StackExchange sites XML file...')
# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
print('done.')
print ''
print('')
# parse sites RSS file and download logos
logos_dir_path = os.path.join(script_dir, '../../../media/images/logos')
# parse sites XML file and download logos
images_dir_path = os.path.join(script_dir, '../../../media/images')
print(os.listdir(images_dir_path))
logos_dir_path = os.path.join(images_dir_path, 'logos48')
if not os.path.exists(logos_dir_path):
os.mkdir(logos_dir_path)
icons_dir_path = os.path.join(images_dir_path, 'icons')
if not os.path.exists(icons_dir_path):
os.mkdir(icons_dir_path)
badges_dir_path = os.path.join(images_dir_path, 'badges')
if not os.path.exists(badges_dir_path):
os.mkdir(badges_dir_path)
with open(sites_file_path) as f:
with open(sites_path) as f:
sites_file = ElementTree.parse(f)
entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
for entry in entries:
entry_title = entry.find('{http://www.w3.org/2005/Atom}title').text.encode('ascii', 'ignore')
sites = sites_file.findall('row')
# print(rows[0].attrib)
for site in sites:
site_title = site.attrib['LongName']
site_name = site.attrib['Name']
# extract the key from the url - remove the http:// and .com
site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
if site_key.startswith('http://'):
site_key = site_key[len('http://'):]
if site_key.endswith('.com'):
site_key = site_key[:-len('.com')]
if site_key.endswith('.stackexchange'):
site_key = site_key[:-len('.stackexchange')]
site_key = site.attrib['TinyName']
site_url = site.attrib['Url'][8:]
logo_url = site.attrib['ImageUrl']
icon_url = site.attrib['IconUrl']
badge_url = site.attrib['BadgeIconUrl']
print 'Downloading logo for %s...' % entry_title,
urllib.urlretrieve('http://sstatic.net/%s/img/icon-48.png' % site_key, os.path.join(logos_dir_path, '%s.png' % site_key))
print 'done.'
site_vars = (site_url, site_key, site_name, site_title)
# print(site_vars)
printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)
try:
logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
if not os.path.exists(logo_file):
print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
except Exception as e:
print('Failed to download logo for %s...' % site_title, str(e))
try:
icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
if not os.path.exists(icon_path):
print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
except:
print('Failed to download icon for %s...' % site_title, icon_url)
try:
badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
if not os.path.exists(badge_file):
print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
except:
printf('Failed to download badge for %s...' % site_title)
site_files = []
print('Key: ' + site_url)
for root, dirs, files in os.walk(se_dir):
for name in files:
if fnmatch.fnmatch(name, site_url + '*'):
print('Match: ' + os.path.join(root, name))
site_files.append(os.path.join(root, name))
sites_data = sites_file_path
for site_file in site_files:
dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + 'xml'
os.makedirs(dst, exist_ok=True)
os.chdir(dst)
os.system('tar xzf '+site_file)
print('Data: ' + site_file)
def prepare_site(xml_root, dump_date, site_key):
print('Using the XML root path: ' + xml_root + '\n')
if not os.path.exists(xml_root):
print('The given XML root path does not exist.')
sys.exit(1)
# connect to the database
print('Connecting to the Stackdump database...')
conn_str = settings.DATABASE_CONN_STR
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')
# MAIN METHOD
if __name__ == '__main__':
parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
parser.add_option('-c', '--dump-date', help='Dump date of the site.')
(cmd_options, cmd_args) = parser.parse_args()
if len(cmd_args) < 1:
print('The path to the directory containing the extracted XML files is required.')
sys.exit(1)
prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)

File diff suppressed because it is too large


@@ -1,18 +1,17 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# This script takes extracted site files and inserts them into the database.
from __future__ import with_statement
import sys
import os
import time
import xml.sax
from datetime import datetime
import re
import urllib2
import urllib
import socket
import tempfile
import traceback
from optparse import OptionParser
from xml.etree import ElementTree
@@ -20,16 +19,10 @@ from sqlobject import sqlhub, connectionForURI, AND, IN, SQLObject, \
UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
from sqlobject.sqlbuilder import Delete, Insert
from sqlobject.styles import DefaultStyle
from pysolr import Solr
from pysolr import Solr, SolrError
from stackdump.models import Site, Badge, User
from stackdump import settings
try:
# For Python < 2.6 or people using a newer version of simplejson
import simplejson as json
except ImportError:
# For Python >= 2.6
import json
script_dir = os.path.dirname(sys.argv[0])
@@ -78,7 +71,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
except Exception, e:
except Exception as e:
# could not insert this, so ignore the row
print('Exception: ' + str(e))
import traceback
@@ -111,7 +104,7 @@ class BadgeContentHandler(BaseContentHandler):
d['userId'] = int(attrs.get('UserId', 0))
d['name'] = attrs.get('Name', '')
d['date'] = attrs.get('Date')
except Exception, e:
except Exception as e:
# could not parse this, so ignore the row completely
self.cur_props = None
print('Exception: ' + str(e))
@@ -145,7 +138,7 @@ class CommentContentHandler(BaseContentHandler):
d['creationDate'] = attrs.get('CreationDate')
d['userId'] = int(attrs.get('UserId', 0))
except Exception, e:
except Exception as e:
# could not parse this, so ignore the row completely
self.cur_props = None
print('Exception: ' + str(e))
@@ -195,7 +188,7 @@ class UserContentHandler(BaseContentHandler):
d['upVotes'] = int(attrs.get('UpVotes', 0))
d['downVotes'] = int(attrs.get('DownVotes', 0))
except Exception, e:
except Exception as e:
# could not parse this, so ignore the row completely
self.cur_props = None
print('Exception: ' + str(e))
@@ -242,7 +235,7 @@ class PostContentHandler(xml.sax.ContentHandler):
if hasattr(obj, 'isoformat'):
return obj.isoformat()
else:
raise TypeError, 'Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj))
raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))
def startElement(self, name, attrs):
if name != 'row':
@@ -260,8 +253,20 @@ class PostContentHandler(xml.sax.ContentHandler):
d['answers'] = [ ]
d['answerCount'] = int(attrs.get('AnswerCount', 0))
d['viewCount'] = int(attrs.get('ViewCount', 0))
elif attrs['PostTypeId'] == '3':
raise ValueError('Skipping row ID [%s] as it is an orphaned tag wiki page (PostTypeId [3]).' % (attrs.get('Id', -1)))
elif attrs['PostTypeId'] == '4':
raise ValueError('Skipping row ID [%s] as it is a tag wiki excerpt (PostTypeId [4]).' % (attrs.get('Id', -1)))
elif attrs['PostTypeId'] == '5':
raise ValueError('Skipping row ID [%s] as it is a tag wiki page (PostTypeId [5]).' % (attrs.get('Id', -1)))
elif attrs['PostTypeId'] == '6':
raise ValueError('Skipping row ID [%s] as it is a moderator nomination post (PostTypeId [6]).' % (attrs.get('Id', -1)))
elif attrs['PostTypeId'] == '7':
raise ValueError('Skipping row ID [%s] as it is a wiki placeholder page (PostTypeId [7]).' % (attrs.get('Id', -1)))
elif attrs['PostTypeId'] == '8':
raise ValueError('Skipping row ID [%s] as it is a privilege wiki page (PostTypeId [8]).' % (attrs.get('Id', -1)))
else:
raise ValueError('Unknown PostTypeId [%s] for row ID [%s]. Probably a tag wiki page.' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
raise ValueError('Unknown PostTypeId [%s] for row ID [%s].' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
if 'AcceptedAnswerId' in attrs:
d['acceptedAnswerId'] = int(attrs.get('AcceptedAnswerId', 0))
@@ -287,7 +292,7 @@ class PostContentHandler(xml.sax.ContentHandler):
d['comments'] = [ ]
except Exception, e:
except Exception as e:
# could not parse this, so ignore the row completely
self.cur_props = None
print('Exception: ' + str(e))
@@ -333,7 +338,7 @@ class PostContentHandler(xml.sax.ContentHandler):
# remove orphan answers from the orphan list
del self.orphan_answers[d['id']]
except Exception, e:
except Exception as e:
# could not insert this, so ignore the row
print('Exception: ' + str(e))
import traceback
@@ -363,7 +368,7 @@ class PostContentHandler(xml.sax.ContentHandler):
# question is complete, store it.
questions_to_commit.append(self.finalise_question(q))
except Exception, e:
except Exception as e:
# could not serialise and insert this question, so ignore it
print('Exception: ' + str(e))
import traceback
@@ -459,6 +464,9 @@ class PostContentHandler(xml.sax.ContentHandler):
doc['answers-json'] = [ json.dumps(a, default=self.json_default_handler) for a in q['answers'] ]
# map other fields to search index doc
# this is the ID for Solr to uniquely identify this question across all
# sites
doc['documentId'] = self.site.key + '-' + str(q['id'])
doc['id'] = str(q['id'])
doc['siteKey'] = self.site.key
doc['creationDate'] = q['creationDate']
@@ -491,7 +499,7 @@ class PostContentHandler(xml.sax.ContentHandler):
if q['acceptedAnswerId'] in post_ids:
question_obj['acceptedAnswerId'] = q['acceptedAnswerId']
else:
print 'Question [ID# %i] had an unknown answer. Possibly been merged or migrated. Ignoring inconsistency.' % (q['id'], )
print('Question [ID# %i] had an unknown answer. Possibly been merged or migrated. Ignoring inconsistency.' % (q['id'], ))
question_obj['creationDate'] = q['creationDate']
question_obj['score'] = q['score']
question_obj['viewCount'] = q['viewCount']
@@ -522,8 +530,31 @@ class PostContentHandler(xml.sax.ContentHandler):
By default, they are committed immediately. Set the ``commit`` argument
to False to disable this behaviour.
This function will loop if a SolrError is encountered to allow the user
to retry the commit without having to start again from the beginning,
e.g. if the Solr instance stops responding.
"""
while True:
try:
self.solr.add(questions, commit=commit)
break
except SolrError:
print('A Solr error occurred while committing questions - ')
traceback.print_exc(file=sys.stdout)
print('')
while True:
response = input('Try committing the questions again? (y/n) ').lower()
if response not in ('y', 'n'):
print("Answer either 'y' or 'n'. Answering 'n' will abort the import process.")
else:
print('')
if response == 'y':
break
else:
raise
except:
raise
def commit_all_questions(self):
"""
@@ -541,7 +572,7 @@ class PostContentHandler(xml.sax.ContentHandler):
# question is complete, store it.
questions_to_commit.append(self.finalise_question(q))
except Exception, e:
except Exception as e:
# could not serialise and insert this question, so ignore it
print('Exception: ' + str(e))
import traceback
@@ -610,8 +641,7 @@ def get_file_path(dir_path, filename):
return os.path.abspath(os.path.join(dir_path, matches[0]))
def import_site(xml_root, site_name, dump_date, site_desc, site_key,
site_base_url, answer_yes=False):
def import_site(xml_root, dump_date, import_key):
print('Using the XML root path: ' + xml_root + '\n')
if not os.path.exists(xml_root):
@@ -624,18 +654,6 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')
# connect to solr
print('Connecting to solr...')
solr = Solr(settings.SOLR_URL, assume_clean=True)
# pysolr doesn't try to connect until a request is made, so we'll make a ping request
try:
solr._send_request('GET', 'admin/ping')
except socket.error, e:
print('Failed to connect to solr - error was: %s' % str(e))
print('Aborting.')
sys.exit(2)
print('Connected.\n')
# ensure required tables exist
print("Creating tables if they don't exist...")
Site.createTable(ifNotExists=True)
@@ -644,117 +662,30 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
print('Created.\n')
# SITE INFO
# only look if they were not specified at the command line; also only if
# readme.txt exists (they don't in dumps after Aug 2012)
readme_path = get_file_path(xml_root, 'readme.txt')
if not (site_name and dump_date) and readme_path:
# get the site name from the first line of readme.txt. This could be fragile.
with open(readme_path, 'r') as f:
site_readme_desc = f.readline().strip()
# only look if they were not specified at the command line;
# assume if there's a colon in the name, the name part is before, and the date
# part is after.
if ':' in site_readme_desc:
readme_site_name, readme_dump_date = site_readme_desc.split(':')
readme_site_name = readme_site_name.strip()
readme_dump_date = readme_dump_date.strip()
else:
readme_site_name = site_readme_desc
readme_dump_date = None
se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
sites_path = os.path.join(se_dir, 'Sites.xml')
# if the phrase ' - Data Dump' is in the readme site name, remove it
i = readme_site_name.rfind(' - Data Dump')
if i >= 0:
readme_site_name = readme_site_name[:i].strip()
if not site_name:
site_name = readme_site_name
if not dump_date:
dump_date = readme_dump_date
# look for the site in the sites RSS file using the base_url with the id in RSS
# scrub the URL scheme off the base_url
if site_base_url:
# if there is no URL scheme, add one so it can be parsed by urllib2 so it
# can strip off other bits in the URL that we don't want
if '://' not in site_base_url:
site_base_url = 'http://%s' % site_base_url
site_base_url = urllib2.Request(site_base_url).get_host()
# attempt to get more information from the sites RSS cache
if site_base_url and not (site_name and site_desc and site_key):
sites_file_path = os.path.join(script_dir, '../../../../data/sites')
if os.path.exists(sites_file_path):
with open(sites_file_path) as f:
with open(sites_path) as f:
sites_file = ElementTree.parse(f)
entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
sites = sites_file.findall('row')
# print(rows[0].attrib)
for entry in entries:
entry_base_url = entry.find('{http://www.w3.org/2005/Atom}id').text
if '://' in entry_base_url:
entry_base_url = urllib2.Request(entry_base_url).get_host()
if site_base_url == entry_base_url:
# this entry matches the detected site id
if not site_key:
# extract the key from the url
rss_site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
# remove the URL scheme
if '://' in rss_site_key:
rss_site_key = rss_site_key[rss_site_key.find('://')+3:]
# remove the TLD
if rss_site_key.rfind('.') >= 0:
rss_site_key = rss_site_key[:rss_site_key.rfind('.')]
# remove the .stackexchange bit
if '.stackexchange' in rss_site_key:
rss_site_key = rss_site_key[:rss_site_key.find('.stackexchange')]
for site in sites:
site_title = site.attrib['LongName']
site_name = site.attrib['Name']
# extract the key from the url - remove the http:// and .com
site_key = site.attrib['TinyName']
site_url = site.attrib['Url'][8:]
logo_url = site.attrib['ImageUrl']
icon_url = site.attrib['IconUrl']
badge_url = site.attrib['BadgeIconUrl']
site_key = rss_site_key
if not site_name:
site_name = entry.find('{http://www.w3.org/2005/Atom}title').text.strip()
if not site_desc:
site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()
print 'Name: %s\nKey: %s\nDescription: %s\nDump Date: %s\nBase URL: %s\n' % (
site_name.encode('ascii', 'ignore') if site_name else None,
site_key,
site_desc.encode('ascii', 'ignore') if site_desc else None,
dump_date,
site_base_url
)
# the base URL is optional.
if not (site_name and site_key and site_desc and dump_date):
print 'Could not get all the details for the site.'
print 'Use command-line parameters to specify the missing details (listed as None).'
sys.exit(1)
# prevent importing sites with keys that clash with method names in the app,
# e.g. a site key of 'search' would clash with the Stackdump-wide search page.
if site_key in ('search', 'import', 'media', 'licenses'):
print 'The site key given, %s, is a reserved word in Stackdump.' % site_key
print 'Use the --site-key parameter to specify an alternate site key.'
sys.exit(2)
# confirm site details with user to make sure we don't accidentally overwrite
# another site.
if not answer_yes:
confirm_prompt = 'Are these details correct (answer "yes" to proceed, anything else to abort)? '
confirm_answer = raw_input(confirm_prompt)
if confirm_answer != 'yes':
print 'Import aborted on user request.'
sys.exit(3)
# rollback any uncommitted entries in solr. Uncommitted entries may occur if
# this import process is aborted. Solr doesn't have the concept of transactions
# like databases do, so without a rollback, we'll be committing the previously
# uncommitted entries plus the newly imported ones.
#
# This also means multiple dataproc processes cannot occur concurrently. If you
# do the import will be silently incomplete.
print('Clearing any uncommitted entries in solr...')
solr._update('<rollback />', waitFlush=None, waitSearcher=None)
print('Cleared.\n')
if (import_key != '') and (import_key != site_key):
continue
else:
print('site_name: '+site_name)
# check if site is already in database; if so, purge the data.
site = list(Site.select(Site.q.key==site_key))
@@ -870,19 +801,12 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
# MAIN METHOD
if __name__ == '__main__':
parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
parser.add_option('-n', '--site-name', help='Name of the site.')
parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
parser.add_option('-c', '--dump-date', help='Dump date of the site.')
parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
(cmd_options, cmd_args) = parser.parse_args()
if len(cmd_args) < 1:
print('The path to the directory containing the extracted XML files is required.')
sys.exit(1)
import_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
cmd_options.site_desc, cmd_options.site_key,
cmd_options.base_url, answer_yes=cmd_options.answer_yes)
import_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)


@@ -0,0 +1 @@
../settings.py


@@ -32,6 +32,9 @@ NUM_OF_DEFAULT_COMMENTS = 3
# number of random questions to show on search query pages
NUM_OF_RANDOM_QUESTIONS = 3
# rewrite links and images to point internally or to a placeholder respectively
REWRITE_LINKS_AND_IMAGES = True
# settings that are available in templates
TEMPLATE_SETTINGS = [
'APP_URL_ROOT',


@@ -34,3 +34,6 @@ from default_settings import *
# number of random questions to show on search query pages
#NUM_OF_RANDOM_QUESTIONS = 3
# rewrite links and images to point internally or to a placeholder respectively
#REWRITE_LINKS_AND_IMAGES = True

1
questions (1).json Normal file

File diff suppressed because one or more lines are too long

1
questions.json Normal file

File diff suppressed because one or more lines are too long

BIN
schema.xlsx Normal file

Binary file not shown.

54
start_python3.sh Executable file

@@ -0,0 +1,54 @@
#!/bin/bash
##
# This script attempts to find a version of Python on the system PATH, and
# checks that it is 3.5+.
#
# An alternate Python command can be specified in a file named PYTHON_CMD in this
# script's directory. This path will override any lookup on the system PATH.
##
# FUNCTIONS
function checkPythonVersion {
if [ ! -z "$1" ]
then
PYTHON_VER_MAJOR=`echo $1 | cut -d "." -f 1`
PYTHON_VER_MINOR=`echo $1 | cut -d "." -f 2`
if [ $PYTHON_VER_MAJOR -eq "3" -a $PYTHON_VER_MINOR -ge "5" ]
then
return 1
fi
fi
return 0
}
# MAIN
SCRIPT_DIR=`dirname $0`
PYTHON_CMD=python3
# if there is a PYTHON_CMD file in the script directory, use that instead
if [ -e "$SCRIPT_DIR/PYTHON_CMD" ]
then
PYTHON_CMD=`cat "$SCRIPT_DIR/PYTHON_CMD"`
fi
if [ ! -z "`which "$PYTHON_CMD" 2>/dev/null`" ]
then
# check if Python is the right version
PYTHON_VER=`"$PYTHON_CMD" -V 2>&1 | cut -d " " -f 2`
checkPythonVersion "$PYTHON_VER"
if [ $? == 1 ]
then
echo "Using Python `which "$PYTHON_CMD"`"
# execution ends here if Python is found
PYTHONPATH=$SCRIPT_DIR/pyth3/packages:$SCRIPT_DIR/python/src:$PYTHONPATH
env "PYTHONPATH=$PYTHONPATH" "$PYTHON_CMD" "$@"
exit $?
fi
fi
# if we get here, it means the right version of Python was not found
echo 'No suitable version of Python was found. Python 3.5 or later is required.'