Mirror of https://github.com/djohnlewis/stackdump

Modified import.py so it no longer relies on readme.txt.

readme.txt files were dropped as of the August 2012 data dump.
Samuel Lai 2012-08-12 15:40:48 +10:00
parent dd24d98b39
commit 1f29fd9113
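With readme.txt gone from the dumps, import.py now falls back to the base URL supplied on the command line (cmd_options.base_url), scrubs it down to a bare host, and matches that host against the entry ids in the cached sites RSS file. The following is a minimal sketch of just the host-scrubbing step, assuming Python 2 with urllib2 as import.py itself uses; the function name and sample URL are illustrative only, not part of the commit:

    import urllib2

    def scrub_base_url(site_base_url):
        # if there is no URL scheme, add one so urllib2 can parse the URL
        # and strip off the path/query bits we don't want
        if '://' not in site_base_url:
            site_base_url = 'http://%s' % site_base_url
        # Request.get_host() returns only the host part of the URL
        return urllib2.Request(site_base_url).get_host()

    print(scrub_base_url('android.stackexchange.com/questions'))  # android.stackexchange.com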

@@ -583,32 +583,49 @@ print('Created.\n')
 # SITE INFO
 site_name = cmd_options.site_name
 dump_date = cmd_options.dump_date
-# only look if they were not specified at the command line
-if not (site_name and dump_date):
+# only look if they were not specified at the command line; also only if
+# readme.txt exists (they don't in dumps after Aug 2012)
+readme_path = os.path.join(xml_root, 'readme.txt')
+if not (site_name and dump_date) and os.path.exists(readme_path):
     # get the site name from the first line of readme.txt. This could be fragile.
-    with open(os.path.join(xml_root, 'readme.txt')) as f:
+    with open(readme_path) as f:
         site_readme_desc = f.readline().strip()
 
     # assume if there's a colon in the name, the name part is before, and the date
     # part is after.
     if ':' in site_readme_desc:
-        site_name, dump_date = site_readme_desc.split(':')
-        site_name = site_name.strip()
-        dump_date = dump_date.strip()
+        readme_site_name, readme_dump_date = site_readme_desc.split(':')
+        readme_site_name = readme_site_name.strip()
+        readme_dump_date = readme_dump_date.strip()
     else:
-        site_name = site_readme_desc
-        dump_date = None
+        readme_site_name = site_readme_desc
+        readme_dump_date = None
 
-    # if the phrase ' - Data Dump' is in the site name, remove it
-    i = site_name.rfind(' - Data Dump')
+    # if the phrase ' - Data Dump' is in the readme site name, remove it
+    i = readme_site_name.rfind(' - Data Dump')
     if i >= 0:
-        site_name = site_name[:i].strip()
+        readme_site_name = readme_site_name[:i].strip()
+
+    if not site_name:
+        site_name = readme_site_name
+    if not dump_date:
+        dump_date = readme_dump_date
 
-# look for the site in the sites RSS file
+# look for the site in the sites RSS file using the base_url with the id in RSS
 site_desc = cmd_options.site_desc
 site_key = cmd_options.site_key
 site_base_url = cmd_options.base_url
-if not (site_desc and site_key and site_base_url):
+
+# scrub the URL scheme off the base_url
+if site_base_url:
+    # if there is no URL scheme, add one so it can be parsed by urllib2 so it
+    # can strip off other bits in the URL that we don't want
+    if '://' not in site_base_url:
+        site_base_url = 'http://%s' % site_base_url
+    site_base_url = urllib2.Request(site_base_url).get_host()
+
+# attempt to get more information from the sites RSS cache
+if site_base_url and not (site_name and site_desc and site_key):
     sites_file_path = os.path.join(script_dir, '../../../../data/sites')
     if os.path.exists(sites_file_path):
         with open(sites_file_path) as f:
@@ -616,24 +633,30 @@ if not (site_desc and site_key and site_base_url):
             entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
             for entry in entries:
-                entry_title = entry.find('{http://www.w3.org/2005/Atom}title').text
-                if site_name == entry_title:
-                    # this entry matches the detected site name
-                    # extract the key from the url - remove the http:// and .com
-                    site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
-                    if site_key.startswith('http://'):
-                        site_key = site_key[len('http://'):]
-                    if site_key.endswith('.com'):
-                        site_key = site_key[:-len('.com')]
-                    if site_key.endswith('.stackexchange'):
-                        site_key = site_key[:-len('.stackexchange')]
-
-                    site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()
-                    site_base_url = entry.find('{http://www.w3.org/2005/Atom}id').text.strip()
-
-# scrub the URL scheme off the base_url
-if site_base_url:
-    site_base_url = urllib2.Request(site_base_url).get_host()
+                entry_base_url = entry.find('{http://www.w3.org/2005/Atom}id').text
+                if '://' in entry_base_url:
+                    entry_base_url = urllib2.Request(entry_base_url).get_host()
+                if site_base_url == entry_base_url:
+                    # this entry matches the detected site id
+                    if not site_key:
+                        # extract the key from the url
+                        rss_site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
+                        # remove the URL scheme
+                        if '://' in rss_site_key:
+                            rss_site_key = rss_site_key[rss_site_key.find('://')+3:]
+                        # remove the TLD
+                        if rss_site_key.rfind('.') >= 0:
+                            rss_site_key = rss_site_key[:rss_site_key.rfind('.')]
+                        # remove the .stackexchange bit
+                        if '.stackexchange' in rss_site_key:
+                            rss_site_key = rss_site_key[:rss_site_key.find('.stackexchange')]
+                        site_key = rss_site_key
+
+                    if not site_name:
+                        site_name = entry.find('{http://www.w3.org/2005/Atom}title').text.strip()
+                    if not site_desc:
+                        site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()
 
 print 'Name: %s\nKey: %s\nDescription: %s\nDump Date: %s\nBase URL: %s\n' % (site_name, site_key, site_desc, dump_date, site_base_url)
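For reference, the key-derivation in the new loop body turns an Atom entry id such as http://android.stackexchange.com into the key 'android'. Below is a standalone sketch of just that string logic; the helper name derive_site_key and the example ids are hypothetical, used only to show the intended behaviour:

    def derive_site_key(entry_id):
        # hypothetical helper mirroring the key-derivation steps in the new loop body
        rss_site_key = entry_id
        # remove the URL scheme
        if '://' in rss_site_key:
            rss_site_key = rss_site_key[rss_site_key.find('://') + 3:]
        # remove the TLD
        if rss_site_key.rfind('.') >= 0:
            rss_site_key = rss_site_key[:rss_site_key.rfind('.')]
        # remove the .stackexchange bit
        if '.stackexchange' in rss_site_key:
            rss_site_key = rss_site_key[:rss_site_key.find('.stackexchange')]
        return rss_site_key

    print(derive_site_key('http://android.stackexchange.com'))  # android
    print(derive_site_key('http://serverfault.com'))            # serverfault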