Mirror of https://github.com/djohnlewis/stackdump, synced 2024-12-04 23:17:37 +00:00

Commit 4616d04c56 (parent f53efd1422): Python update
.gitignore (vendored, 2 lines changed)

@@ -4,7 +4,7 @@
 .DS_Store
 
 # ignore any data
-^data/*$
+data
 
 # ignore working bytecode
 \.class$
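A note on the pattern change above: .gitignore entries are glob patterns, not regular expressions, so the regex-style `^data/*$` never matched the data directory, while the bare `data` entry does. A minimal sketch of the glob alternatives, assuming the intent is to ignore the repository's data directory:

    /data/    # only the data directory at the repository root
    data/     # any directory named data, at any depth
    data      # any file or directory named data (the form used in this commit)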
python/src/stackdump/commands/default_settings.py (new symbolic link, 1 line)

@@ -0,0 +1 @@
+../default_settings.py
(unnamed file: the sites/logo download script)

@@ -1,33 +1,49 @@
 #!/usr/bin/env python
 
 # This script downloads the sites RSS file and associated logos from the net.
 
+import tarfile
 import urllib.request
 from xml.etree import ElementTree
 import sys
-import os, ssl
+
+
+def printf(format, *args):
+    sys.stdout.write(format % args)
+
+
+from shutil import copy
+import os, ssl, fnmatch
+from optparse import OptionParser
+import elasticsearch
+
+import settings
+from sqlobject import sqlhub, connectionForURI, AND, IN, SQLObject, \
+    UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
+from sqlobject.sqlbuilder import Delete, Insert
+from sqlobject.styles import DefaultStyle
+from pysolr import Solr, SolrError
 
 if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
         getattr(ssl, '_create_unverified_context', None)):
     ssl._create_default_https_context = ssl._create_unverified_context
 
 se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
 sites_path = os.path.join(se_dir, 'Sites.xml')
 
 script_dir = os.path.dirname(sys.argv[0])
-sites_file_path = os.path.join(script_dir, '../../../../data/sites')
+sites_file_path = os.path.join(script_dir, '../../../../data/')
 # ensure the data directory exists
+# download the sites RSS file
 if not os.path.exists(os.path.dirname(sites_file_path)):
     os.mkdir(os.path.dirname(sites_file_path))
 
-# download the sites RSS file
 print('Downloading StackExchange sites XML file...')
-urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
+# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
 print('done.')
 
 print('')
 
 
 # parse sites RSS file and download logos
 images_dir_path = os.path.join(script_dir, '../../../media/images')
 print(os.listdir(images_dir_path))
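The TLS block retained above disables certificate verification process-wide by installing `ssl._create_unverified_context` as the default HTTPS context. A minimal sketch of a narrower alternative that skips verification for a single request only, using the archive.org URL from the hunk; this is an illustrative pattern, not part of the commit:

    import ssl
    import urllib.request

    # Build an explicit context so only this one request skips verification.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    url = 'https://archive.org/download/stackexchange/Sites.xml'
    with urllib.request.urlopen(url, context=ctx) as resp:
        with open('Sites.xml', 'wb') as f:
            f.write(resp.read())  # write the sites XML to disk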
@@ -43,50 +59,80 @@ if not os.path.exists(badges_dir_path):
 
 with open(sites_path) as f:
     sites_file = ElementTree.parse(f)
-rows = sites_file.findall('row')
-# print(rows[0].attrib)
+sites = sites_file.findall('row')
+# print(sites[0].attrib)
 
-for row in rows:
-    entry_title = row.attrib['Name']
-    print('Entry: ' + entry_title)
+for site in sites:
+    site_title = site.attrib['LongName']
+    site_name = site.attrib['Name']
 
     # extract the key from the url - remove the http:// and .com
-    site_key = row.attrib['TinyName']
-    print('Site: ' + site_key)
-    logo_url = row.attrib['ImageUrl']
-    icon_url = row.attrib['IconUrl']
-    badge_url = row.attrib['BadgeIconUrl']
+    site_key = site.attrib['TinyName']
+    site_url = site.attrib['Url'][8:]
+    logo_url = site.attrib['ImageUrl']
+    icon_url = site.attrib['IconUrl']
+    badge_url = site.attrib['BadgeIconUrl']
+
+    site_vars = (site_url, site_key, site_name, site_title)
+    # print(site_vars)
+    printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)
 
     try:
-        print('Downloading logo for %s...' % entry_title,
-              urllib.request.urlretrieve(logo_url, os.path.join(logos_dir_path, 'logo-%s.png' % site_key)))
+        logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
+        if not os.path.exists(logo_file):
+            print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
     except:
-        print('Failed download logo for %s...' % entry_title)
+        print('Failed download logo for %s...' % site_title)
 
     try:
-        print('Downloading icon for %s...' % entry_title,
-              urllib.request.urlretrieve(icon_url, os.path.join(icons_dir_path, 'icon-%s.png' % site_key)))
+        icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
+        if not os.path.exists(icon_path):
+            print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
     except:
-        print('Failed download ico for %s...' % entry_title)
+        print('Failed download icon for %s...' % site_title)
 
     try:
-        print('Downloading badge for %s...' % entry_title,
-              urllib.request.urlretrieve(badge_url, os.path.join(badges_dir_path, 'badge-%s.png' % site_key)))
+        badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
+        if not os.path.exists(badge_file):
+            print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
     except:
-        print('Failed download badge for %s...' % entry_title)
-
-print('done.')
+        print('Failed download badge for %s...' % site_title)
+
+    site_files = []
+    print('Key: ' + site_url)
+    for root, dirs, files in os.walk(se_dir):
+        for name in files:
+            if fnmatch.fnmatch(name, site_url + '*'):
+                print('Match: ' + os.path.join(root, name))
+                site_files.append(os.path.join(root, name))
+
+    sites_data = sites_file_path
+    for site_file in site_files:
+        dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z' \
+              + os.sep + os.path.basename(site_file)
+        os.makedirs(dst, exist_ok=True)
+        os.chdir(dst)
+        os.system('tar xzf ' + site_file)
+        print('Data: ' + site_file)
+
+
+def prepare_site(xml_root, dump_date, site_key):
+    print('Using the XML root path: ' + xml_root + '\n')
+
+    if not os.path.exists(xml_root):
+        print('The given XML root path does not exist.')
+        sys.exit(1)
+
+    # connect to the database
+    print('Connecting to the Stackdump database...')
+    conn_str = settings.DATABASE_CONN_STR
+    sqlhub.processConnection = connectionForURI(conn_str)
+    print('Connected.\n')
+
 
 # MAIN METHOD
 if __name__ == '__main__':
     parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
-    parser.add_option('-n', '--site-name', help='Name of the site.')
-    parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
     parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
     parser.add_option('-c', '--dump-date', help='Dump date of the site.')
-    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
-    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
 
     (cmd_options, cmd_args) = parser.parse_args()
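The new per-site block above finds downloaded archives by walking `se_dir` and matching file names against `site_url + '*'` with `fnmatch`, then extracts each match by `os.chdir`-ing into the destination and shelling out to `tar xzf`. Since the hunk already imports `tarfile`, here is a minimal sketch of the same extraction without changing the working directory; the names mirror the hunk's variables, and this is an illustrative alternative, not the committed code:

    import os
    import tarfile

    def extract_site_archive(site_file, dst):
        """Extract one gzipped tar archive into dst without os.chdir/os.system."""
        os.makedirs(dst, exist_ok=True)
        with tarfile.open(site_file, 'r:gz') as archive:
            archive.extractall(path=dst)  # extract in place, cwd untouched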
@@ -94,6 +140,4 @@ if __name__ == '__main__':
         print('The path to the directory containing the extracted XML files is required.')
         sys.exit(1)
 
-    prepare_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
-                 cmd_options.site_desc, cmd_options.site_key,
-                 cmd_options.base_url, answer_yes=cmd_options.answer_yes)
+    prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
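With the `-n`, `-d`, `-u`, and `-Y` options removed, the script now takes only a site key and dump date alongside the XML root. A short sketch of how the trimmed option set parses, assuming standard optparse behaviour; the argument values are illustrative:

    from optparse import OptionParser

    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
    parser.add_option('-c', '--dump-date', help='Dump date of the site.')

    # Parse an example command line: the positional argument is the XML root.
    (opts, args) = parser.parse_args(['-k', 'pets', '-c', '2024-12', '/tmp/xml'])
    print(opts.site_key, opts.dump_date, args[0])  # -> pets 2024-12 /tmp/xml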
python/src/stackdump/commands/import_recent.py (new file, 911 lines)
File diff suppressed because it is too large.
(unnamed file: the site import script)

@@ -2,8 +2,6 @@
 
 # This script takes extracted site files and inserts them into the database.
 
-from __future__ import with_statement
-
 import sys
 import os
 import time
@@ -25,12 +23,6 @@ from pysolr import Solr, SolrError
 
 from stackdump.models import Site, Badge, User
 from stackdump import settings
 
-try:
-    # For Python < 2.6 or people using a newer version of simplejson
-    import simplejson as json
-except ImportError:
-    # For Python >= 2.6
-    import json
+import json
 
 script_dir = os.path.dirname(sys.argv[0])
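Dropping the `simplejson` fallback is safe on Python 3, where `json` is always in the standard library. If `simplejson`'s optional speedups were still wanted, the removed guarded import would also keep working on Python 3; a minimal sketch of that pattern:

    try:
        import simplejson as json  # optional third-party implementation
    except ImportError:
        import json                # stdlib json, always present on Python 3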
@@ -649,8 +641,7 @@ def get_file_path(dir_path, filename):
     return os.path.abspath(os.path.join(dir_path, matches[0]))
 
 
-def import_site(xml_root, site_name, dump_date, site_desc, site_key,
-                site_base_url, answer_yes=False):
+def import_site(xml_root, dump_date, site_key):
     print('Using the XML root path: ' + xml_root + '\n')
 
     if not os.path.exists(xml_root):
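Every caller must move to the three-argument signature; the commit updates the `__main__` block in the next hunk accordingly. A hypothetical direct call for illustration only (the path, date, and key are made up):

    # Hypothetical values; import_site is the function defined above.
    import_site('/home/user/stackexchange/p/pets/7z', '2024-12-01', 'pets')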
@@ -909,19 +900,12 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
 # MAIN METHOD
 if __name__ == '__main__':
     parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
-    parser.add_option('-n', '--site-name', help='Name of the site.')
-    parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
     parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
     parser.add_option('-c', '--dump-date', help='Dump date of the site.')
-    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
-    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
 
     (cmd_options, cmd_args) = parser.parse_args()
 
     if len(cmd_args) < 1:
         print('The path to the directory containing the extracted XML files is required.')
         sys.exit(1)
 
-    import_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
-                cmd_options.site_desc, cmd_options.site_key,
-                cmd_options.base_url, answer_yes=cmd_options.answer_yes)
+    import_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
python/src/stackdump/commands/settings.py (new symbolic link, 1 line)

@@ -0,0 +1 @@
+../settings.py
questions (1).json (new file, 1 line)
File diff suppressed because one or more lines are too long.

questions.json (new file, 1 line)
File diff suppressed because one or more lines are too long.

schema.xlsx (new binary file)
Binary file not shown.