mirror of https://github.com/djohnlewis/stackdump synced 2024-12-04 06:57:36 +00:00

Python update

This commit is contained in:
djohnlewis 2021-06-10 09:46:47 +01:00
parent f53efd1422
commit 4616d04c56
9 changed files with 1000 additions and 57 deletions

.gitignore vendored

@@ -4,7 +4,7 @@
.DS_Store
# ignore any data
data
# ignore working bytecode
\.class$
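The removed "^data/*$" entry reads like a regular expression, but .gitignore matches glob patterns, so it could only ever match a literal path containing the ^ and $ characters; the plain "data" entry now matches a data file or directory at any depth. If only the top-level directory were meant, a stricter pattern (a sketch, not what the commit uses) would be:

# anchored to the repository root; the trailing slash restricts it to directories
/data/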


@@ -0,0 +1 @@
../default_settings.py


@@ -1,33 +1,49 @@
#!/usr/bin/env python
# This script downloads the sites RSS file and associated logos from the net.
import tarfile
import urllib.request
import sys
import os, ssl, fnmatch
from shutil import copy
from optparse import OptionParser
from xml.etree import ElementTree

import elasticsearch
import settings
from sqlobject import sqlhub, connectionForURI, AND, IN, SQLObject, \
    UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
from sqlobject.sqlbuilder import Delete, Insert
from sqlobject.styles import DefaultStyle
from pysolr import Solr, SolrError

def printf(format, *args):
    sys.stdout.write(format % args)

# allow unverified HTTPS downloads on interpreters that support opting out
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
        getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context

se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
sites_path = os.path.join(se_dir, 'Sites.xml')
script_dir = os.path.dirname(sys.argv[0])
sites_file_path = os.path.join(script_dir, '../../../../data/')

# ensure the data directory exists
if not os.path.exists(os.path.dirname(sites_file_path)):
    os.mkdir(os.path.dirname(sites_file_path))

# download the sites RSS file
print('Downloading StackExchange sites XML file...')
# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
print('done.')
print('')

# parse the sites RSS file and download the logos
images_dir_path = os.path.join(script_dir, '../../../media/images')
print(os.listdir(images_dir_path))
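The block above replaces ssl's process-wide default context, which disables certificate verification for every HTTPS call the script makes. A more contained sketch, assuming the same archive.org URL, scopes the unverified context to a single request instead:

import ssl
import urllib.request

# build an unverified context once and pass it to this request only
ctx = ssl._create_unverified_context()
url = 'https://archive.org/download/stackexchange/Sites.xml'
with urllib.request.urlopen(url, context=ctx) as resp:
    with open('Sites.xml', 'wb') as f:
        f.write(resp.read())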
@@ -43,50 +59,80 @@ if not os.path.exists(badges_dir_path):
with open(sites_path) as f:
    sites_file = ElementTree.parse(f)
sites = sites_file.findall('row')
# print(sites[0].attrib)

for site in sites:
    site_title = site.attrib['LongName']
    site_name = site.attrib['Name']
    # extract the key from the url - remove the http:// and .com
    site_key = site.attrib['TinyName']
    site_url = site.attrib['Url'][8:]
    logo_url = site.attrib['ImageUrl']
    icon_url = site.attrib['IconUrl']
    badge_url = site.attrib['BadgeIconUrl']
    site_vars = (site_url, site_key, site_name, site_title)
    # print(site_vars)
    printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)

    try:
        logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
        if not os.path.exists(logo_file):
            print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
    except Exception:
        print('Failed to download logo for %s...' % site_title)
    try:
        icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
        if not os.path.exists(icon_path):
            print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
    except Exception:
        print('Failed to download icon for %s...' % site_title)
    try:
        badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
        if not os.path.exists(badge_file):
            print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
    except Exception:
        print('Failed to download badge for %s...' % site_title)

    # find this site's data dump archives under ~/stackexchange
    site_files = []
    print('Key: ' + site_url)
    for root, dirs, files in os.walk(se_dir):
        for name in files:
            if fnmatch.fnmatch(name, site_url + '*'):
                print('Match: ' + os.path.join(root, name))
                site_files.append(os.path.join(root, name))
    print('done.')

    # extract each archive under data/<first letter>/<site key>/7z/
    sites_data = sites_file_path
    for site_file in site_files:
        dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z' \
            + os.sep + os.path.basename(site_file)
        os.makedirs(dst, exist_ok=True)
        os.chdir(dst)
        os.system('tar xzf ' + site_file)
        print('Data: ' + site_file)
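# Aside (a sketch, not part of this commit): the script imports tarfile but
# shells out to tar above. The same step with the imported module, assuming
# the matched dumps are gzipped tarballs as 'tar xzf' implies; extract_dump
# is a hypothetical helper name.
def extract_dump(site_file, dst):
    # create the destination tree, then extract without os.chdir or a subshell
    os.makedirs(dst, exist_ok=True)
    with tarfile.open(site_file, 'r:gz') as archive:
        archive.extractall(dst)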
def prepare_site(xml_root, dump_date, site_key):
    print('Using the XML root path: ' + xml_root + '\n')
    if not os.path.exists(xml_root):
        print('The given XML root path does not exist.')
        sys.exit(1)

    # connect to the database
    print('Connecting to the Stackdump database...')
    conn_str = settings.DATABASE_CONN_STR
    sqlhub.processConnection = connectionForURI(conn_str)
    print('Connected.\n')
# MAIN METHOD
if __name__ == '__main__':
    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
    parser.add_option('-c', '--dump-date', help='Dump date of the site.')
    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)

    (cmd_options, cmd_args) = parser.parse_args()
@@ -94,6 +140,4 @@ if __name__ == '__main__':
        print('The path to the directory containing the extracted XML files is required.')
        sys.exit(1)

    prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
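For context: SQLObject routes every query through sqlhub.processConnection, so the connection step in prepare_site above is the only database setup this script needs. A minimal standalone sketch, with an assumed SQLite URI standing in for whatever settings.DATABASE_CONN_STR holds:

from sqlobject import sqlhub, connectionForURI

# the URI scheme selects the driver; this path is illustrative only
sqlhub.processConnection = connectionForURI('sqlite:///tmp/stackdump.db')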

File diff suppressed because it is too large.


@@ -2,8 +2,6 @@
# This script takes extracted site files and inserts them into the database.
import sys
import os
import time
@@ -25,13 +23,7 @@ from pysolr import Solr, SolrError
from stackdump.models import Site, Badge, User
from stackdump import settings
import json

script_dir = os.path.dirname(sys.argv[0])
@@ -649,8 +641,7 @@ def get_file_path(dir_path, filename):
    return os.path.abspath(os.path.join(dir_path, matches[0]))

def import_site(xml_root, dump_date, site_key):
    print('Using the XML root path: ' + xml_root + '\n')
    if not os.path.exists(xml_root):
@@ -909,19 +900,12 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
# MAIN METHOD
if __name__ == '__main__':
    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
    parser.add_option('-c', '--dump-date', help='Dump date of the site.')
    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)

    (cmd_options, cmd_args) = parser.parse_args()

    if len(cmd_args) < 1:
        print('The path to the directory containing the extracted XML files is required.')
        sys.exit(1)

    import_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)


@@ -0,0 +1 @@
../settings.py

questions (1).json Normal file

File diff suppressed because one or more lines are too long

questions.json Normal file

File diff suppressed because one or more lines are too long
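The two JSON fixtures added by this commit are collapsed by the viewer, so their structure is not visible here. A minimal sketch for inspecting one locally (the filename comes from the commit's file list; nothing about the schema is assumed):

import json

with open('questions.json') as f:
    questions = json.load(f)

# report the top-level shape without assuming a schema
print(type(questions).__name__)
if isinstance(questions, list) and questions and isinstance(questions[0], dict):
    print(sorted(questions[0].keys()))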

schema.xlsx Normal file

Binary file not shown.