mirror of
https://github.com/djohnlewis/stackdump
synced 2025-12-06 07:53:28 +00:00
Renamed the commands directory and added a script to make them easier to call.
Also deleted the get_sites script as it wasn't very useful, and renamed others to be more meaningful.
This commit is contained in:
@@ -1,38 +0,0 @@
|
||||
#!/usr/bin/env python

# This script reads a directory containing the Stack Exchange data dump and
# returns a list of sites.
#
# Usage: <script> <path to the data dump directory>
#
# The given directory is expected to contain a 'Content' subdirectory holding
# the per-site .7z archives. The set of site names found is printed to stdout.

import sys
import os
import re

# Matches a single archive ("site.7z") or one part of a split archive
# ("site.7z.001"). Exactly one of the two groups participates in a match, so
# match.lastindex always identifies the group holding the site name.
CONTENT_FILENAME_RE = re.compile(r'^(.+)\.7z$|^(.+)\.7z\.\d{3}$')


def extract_sites(filenames):
    """Return the set of site names found in an iterable of file names.

    A file name contributes a site when it matches CONTENT_FILENAME_RE; the
    site name is the part before the ".7z" / ".7z.NNN" suffix. Parts of the
    same split archive collapse to a single entry because a set is used.
    Names that do not match are ignored.
    """
    sites = set()

    for f in filenames:
        match = CONTENT_FILENAME_RE.match(f)
        if match:
            sites.add(match.group(match.lastindex))

    return sites


def main(argv):
    """Run the script with the given argument vector.

    argv is expected to look like sys.argv: the program name followed by
    exactly one argument, the path to the data dump directory.

    Returns the process exit code: 0 on success, 1 when the arguments or the
    data dump path are invalid (a message is printed in that case).
    """
    if len(argv) != 2:
        print('One argument is expected - the path to the data dump directory.')
        return 1

    dump_path = argv[1]
    print('Using the data dump path: ' + dump_path + '\n')

    if not os.path.exists(dump_path):
        print('The given data dump path does not exist.')
        return 1

    # we expect it to contain a 'Content' directory
    dump_path = os.path.join(dump_path, 'Content')
    # isdir rather than exists: a plain file named 'Content' would otherwise
    # pass this check and make os.listdir() below raise.
    if not os.path.isdir(dump_path):
        print('The given data dump path is invalid. The Content subdirectory was expected.')
        return 1

    sites = extract_sites(os.listdir(dump_path))

    # Parenthesised form works as both the Python 2 print statement and the
    # Python 3 print function.
    print(sites)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
||||
|
||||
Reference in New Issue
Block a user