SKE48 Blog - '+name+'

#!/usr/bin/env python
# Version: 20160609-02
"""Mirror the newest SKE48 member blog entries into per-month local folders.

For every blog name listed in the local ``bloglist`` file the script downloads
``http://www.ske48.co.jp/rss/blog_<name>.xml``, saves each entry that has no
local copy yet as ``<month>/<id>.html`` (``month`` being the first six
characters of the entry id), and rebuilds ``<month>/index.html`` from the
append-only ``<month>/index`` file.

This is a Python 2 script: it relies on ``urllib.urlretrieve`` and on
``reload(sys)`` / ``sys.setdefaultencoding``.  ``print`` is always called with
a single parenthesised string, which behaves identically on Python 2.

NOTE(review): several string literals and regex patterns below contain no
markup at all in this copy of the file (empty patterns, page skeletons made
only of newlines).  It looks like HTML tags were stripped from them when the
file was copied -- confirm against the upstream script and restore them.
They are reproduced here exactly as found.
"""

import os
import re
import sys
import urllib
from xml.dom import minidom


def getEntryFromXml(name, index):
    """Return ``(id, link, timestamp, title)`` for one item of an RSS file.

    name  -- path of the XML file to parse.
    index -- zero-based position of the ``item`` element to read.

    Each value is the text of the first ``guid`` / ``link`` / ``pubDate`` /
    ``title`` element found inside that item.

    Raises IndexError when the file has no item at ``index`` (or the item
    lacks one of the four elements) and AttributeError when one of the
    elements is empty.
    """
    item = minidom.parse(name).getElementsByTagName("item")[index]

    def text_of(tag):
        return item.getElementsByTagName(tag)[0].firstChild.nodeValue

    return (text_of("guid"), text_of("link"), text_of("pubDate"), text_of("title"))


def processEntry(id, timestamp, name, title, month, newEntries):
    """Rewrite a freshly downloaded entry page and record it as new.

    id         -- entry id; the page is read from and written back to
                  ``<month>/<id>.html``.  (The name shadows the builtin but is
                  kept so existing callers keep working.)
    timestamp  -- publication date string from the feed.
    name       -- blog name, written into the page header and the index text.
    title      -- entry title, used in the index text.
    month      -- folder that holds the entry.
    newEntries -- list that receives one index line for this entry.

    If the entry text contains something matching ``http.*jpg`` the image is
    downloaded to ``<month>/<id>.jpg`` (with ``/blog/`` replaced by
    ``/blog2/`` in the URL).  Errors from file or network access propagate to
    the caller.
    """
    path = month + "/" + id + ".html"

    # Read HTML file to string
    with open(path, 'r') as html:
        s = html.read()

    # Get blog entry from string
    # NOTE(review): as written this pattern matches the whole page; the
    # delimiting markup seems to be missing -- confirm against upstream.
    matches = re.findall('.*', s, re.DOTALL)
    entry = matches[0]

    # Download image if there is one and update blog entry to use local image
    matches = re.findall('http.*jpg', entry)
    if matches:
        image = matches[0]
        image2 = re.sub('/blog/', '/blog2/', image)
        image3 = id + ".jpg"
        urllib.urlretrieve(image2, filename=month + "/" + image3)
        # NOTE(review): both the search pattern and the replacement have lost
        # their markup in this copy (empty pattern, newline-only replacement,
        # image3 never referenced) -- restore from upstream before relying on
        # the image rewrite.
        matches = re.findall('', entry)
        p = re.escape(matches[0])
        img = '\n\n'
        entry = re.sub(p, img, entry)

    # Write new HTML file for blog entry
    with open(path, 'w') as html:
        html.write('\n\n\n\n\n\nSKE48 Blog - ' + name + '\n\n\n\n\n' + name + '\n\n\n')
        html.write(entry)
        html.write('\n\n')

    # Add link to new HTML file to new entries list
    timestamp = re.sub(r':00 \+0900', '', timestamp)
    newEntries.append('' + timestamp + ' - ' + name + ': ' + title + '')


def _load_blog_names(path='bloglist'):
    """Return the non-empty lines of the blog list file, without line endings."""
    with open(path, 'r') as fd:
        names = [line.rstrip('\r\n') for line in fd]
    return [blog for blog in names if blog]


def _download_feeds(bloglist):
    """Fetch the current RSS file of every blog; failures are reported, not fatal."""
    print("Downloading newest versions of XML files...")
    for name in bloglist:
        try:
            urllib.urlretrieve("http://www.ske48.co.jp/rss/blog_" + name + ".xml", filename=name + ".xml")
        except Exception as exc:
            print("Error downloading newest version of " + name + "'s XML file.")
            print("  Reason: " + repr(exc))
    print("Finished trying to download newest versions of XML files. Should have been successful if there's no error messages above.")


def _fetch_new_entries(bloglist):
    """Download and process entries without a local copy.

    Returns a list of ``(id, month, index_text)`` tuples, one per new entry.
    """
    found = []
    for name in bloglist:
        # Regular member blogs are checked for one entry, the kenkyuusei blog for five
        entryCount = 5 if name == "kenkyuuseiall" else 1
        try:
            for index in range(entryCount):
                entry_id, link, timestamp, title = getEntryFromXml(name + ".xml", index)

                # Entries are grouped in a folder named after the first six characters of the id
                month = entry_id[0:6]
                if not os.path.exists(month):
                    os.makedirs(month)

                # Skip entries that already have a local copy
                target = month + "/" + entry_id + ".html"
                if os.path.exists(target):
                    continue

                print("New entry from " + name + "! Processing...")
                urllib.urlretrieve(link, filename=target)
                fresh = []
                processEntry(entry_id, timestamp, name, title, month, fresh)
                # Keep id and month next to the rendered text so nothing has
                # to be parsed back out of that text later.
                found.extend((entry_id, month, text) for text in fresh)
                print("Finished processing new entry for " + name + ".")
        except Exception as exc:
            print("Error processing " + name + "'s local XML file.")
            print("  Reason: " + repr(exc))
    return found


def _update_indices(found):
    """Append new entries to their month's index file and rebuild its index.html."""
    months = set()
    # Sorted by entry id so each index file is appended in id order
    for _entry_id, month, text in sorted(found):
        months.add(month)
        with open("%s/index" % month, 'a') as fd:
            fd.write("%s\n" % text)

    # Create updated index HTMLs from index files, newest appended line first
    for month in months:
        with open(month + '/index', 'r') as fd:
            lines = fd.readlines()
        with open(month + '/index.html', 'w') as ihtml:
            ihtml.write('\n\n\n\n\nSKE48 Blog\n\n\n')
            for line in reversed(lines):
                ihtml.write(line + '\n')
            ihtml.write('\n\n')
    print("Index HTML files updated.")


def main():
    """Run one full update: download feeds, mirror new entries, rebuild indices."""
    # This seems to be necessary for some string manipulations with re.sub
    # (Python 2 only: reload() restores sys.setdefaultencoding).
    reload(sys)
    sys.setdefaultencoding('utf8')

    bloglist = _load_blog_names()
    _download_feeds(bloglist)
    newEntries = _fetch_new_entries(bloglist)
    print("All local XML files checked. New entries: " + str(len(newEntries)) + ".")
    if newEntries:
        # Per-month index handling follows wowaname's set-based version
        _update_indices(newEntries)


if __name__ == "__main__":
    main()