#!/usr/bin/env python
# Version: 20160609-02

import os
import urllib
import re
import sys
from xml.dom import minidom

def _getElementText(parent, tag):
    """Return the text of the first <tag> element found below *parent*.

    All text and CDATA child nodes are concatenated, so an element whose
    content is split across several nodes is returned whole and an empty
    element yields an empty string instead of raising AttributeError.

    Raises IndexError if *parent* has no <tag> descendant.
    """
    element = parent.getElementsByTagName(tag)[0]
    return u"".join(
        child.data for child in element.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )

def getEntryFromXml(name, index):
    """Read one <item> from an RSS XML file.

    name  -- file name (or open file object) handed to minidom.parse
    index -- zero-based position of the <item> element to read

    Returns the tuple (guid, link, pubDate, title) holding the text of the
    item's corresponding child elements. Raises IndexError if there is no
    item at *index* or if one of the four child elements is missing.
    """
    xml = minidom.parse(name)
    item = xml.getElementsByTagName("item")[index]
    return (
        _getElementText(item, "guid"),
        _getElementText(item, "link"),
        _getElementText(item, "pubDate"),
        _getElementText(item, "title"),
    )

def processEntry(id, timestamp, name, title, month, newEntries):
    """Reduce a downloaded blog page to a small standalone HTML file.

    Reads month/id.html, keeps only the part between the <!-- BLOG --> and
    <!-- //BLOG --> markers, and overwrites the file with that part wrapped
    in a minimal HTML page. If the entry references a .jpg URL, the image is
    downloaded to month/id.jpg and the first single-line <p>...</p> of the
    entry is replaced by an <img> tag pointing at the local copy.

    id         -- entry id; used as base name for the .html and .jpg files
    timestamp  -- publication date string; a trailing ":00 +0900" is dropped
    name       -- blog name shown in the page title/heading and in the link
    title      -- entry title shown in the link text
    month      -- directory that holds the entry's files
    newEntries -- list that receives the '<a href=...>' link for this entry

    Raises IndexError if the BLOG markers are not found in the page.
    """
    path = month+"/"+id+".html"
    # Read HTML file to string; 'with' closes the handle even if read() fails
    with open(path, 'r') as html:
        s = html.read()
    # Get blog entry from string
    matches = re.findall(r'<!-- BLOG -->.*<!-- //BLOG -->', s, re.DOTALL)
    entry = matches[0]
    # Download image if there is one and update blog entry to use local image
    matches = re.findall(r'http.*jpg', entry)
    if matches:
        image = matches[0]
        # NOTE(review): the download uses the /blog2/ variant of the URL;
        # presumably that is where the full image lives -- confirm.
        image2 = image.replace('/blog/', '/blog2/')
        image3 = id+".jpg"
        urllib.urlretrieve(image2, filename=month+"/"+image3)
        paragraphs = re.findall(r'<p.*</p>', entry)
        # Without a single-line paragraph there is nothing to swap; keep the
        # entry as it is instead of failing after the image was downloaded.
        if paragraphs:
            img = '<img src="'+image3+'">'
            entry = re.sub(re.escape(paragraphs[0]), img, entry)
    # Write new HTML file for blog entry
    with open(path, 'w') as html:
        html.write('<!DOCTYPE HTML>\n<html>\n<head>\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n<meta name="viewport" content="width=device-width, initial-scale=1">\n<link href="../style.css" rel="stylesheet" type="text/css">\n<title>SKE48 Blog - '+name+'</title>\n</head>\n<body>\n<h3>'+name+'</h3>\n')
        html.write(entry)
        html.write('\n</body>\n</html>')
    # Add link to new HTML file to new entries list
    timestamp = re.sub(r':00 \+0900', '', timestamp)
    newEntries.append('<a href="'+id+'.html">'+timestamp+' - '+name+': '+title+'</a>')

# Set the process-wide default string encoding to UTF-8. According to the
# original author this seems to be necessary for some of the string
# manipulations done with re.sub in this script.
# NOTE(review): reload(sys) is presumably what makes sys.setdefaultencoding
# callable here; this is a Python 2-only workaround -- confirm before porting.
reload(sys)
sys.setdefaultencoding('utf8')

# Load the blog names to work through from the text file 'bloglist' (one name
# per line) and create the list that collects links to new entries.
# Only the line terminator is stripped: slicing off the last character would
# truncate the final name when the file does not end with a newline and would
# leave a '\r' behind for files with Windows line endings.
with open('bloglist', 'r') as bloglistFile:
    bloglist = [line.rstrip('\r\n') for line in bloglistFile]
# Blank lines carry no blog name, so drop them instead of requesting "blog_.xml"
bloglist = [entry for entry in bloglist if entry]
newEntries = []

print "Downloading newest versions of XML files..."
for name in bloglist:
    try:
        urllib.urlretrieve("http://www.ske48.co.jp/rss/blog_"+name+".xml", filename=name+".xml")
    except:
        print "Error downloading newest version of " + name + "'s XML file."
print "Finished trying to download newest versions of XML files. Should have been successful if there's no error messages above."

# Loop through bloglist downloading all new entries
for name in bloglist:

    try:

        # Set number of entries to check from XML
        entryCount = 1
        if name == "kenkyuuseiall":
            entryCount = 5

        # Loop run through only once for regular member blogs but multiple times for kenkyuusei blog
        for index in range(0, entryCount):

            # Get info for blog entry
            entry = getEntryFromXml(name+".xml", index)
            id = entry[0]
            link = entry[1]
            timestamp = entry[2]
            title = entry[3]
        
            # Check month the entry was created in and create a new folder for the month if there is none
            month = id[0:6]
            if not os.path.exists(month):
                os.makedirs(month)
        
            # Get list of files in the directory for the month the entry was created in
            files = os.listdir(month)
        
            # Download and process HTML file if there's no local copy of the entry yet
            if not id+".html" in files:
                print "New entry from " + name + "! Processing..." 
                urllib.urlretrieve(link, filename=month+"/"+id+".html")
                processEntry(id, timestamp, name, title, month, newEntries)
                print "Finished processing new entry for " + name + "."

    except:
        print "Error processing " + name + "'s local XML file."

print "All local XML files checked. New entries: " + str(len(newEntries)) + "."

if newEntries:

    # Sort links to new entries
    newEntries.sort()

#    # Make list of months that had new entries this run
#    months = []
#    for item in newEntries:
#        month = item[9:15]
#        if not month in months:
#            months.append(month)
#    # Open index files for months with new entries
#    indices = []
#    for month in months:
#        indices.append(open(month+'/index', 'a'))
#    # Append links to new entries to their month-respective index files
#    for item in newEntries:
#        month = item[9:15]
#        indices[months.index(month)].write(item+'\n')
#    # Close index files for months with new entries
#    for item in indices:
#        item.close()

    # Make set of months that had new entries this run and append links to new entries to their month-respective index files (wowaname's version of the longer block above)
    months = set()
    for item in newEntries:
        month = item[9:15]
        months.add(month)
        with open("%s/index" % month, 'a') as fd:
            fd.write("%s\n" % item)

    # Create updated index HTMLs from index files
    for month in months:
        ihtml = open(month+'/index.html', 'w')
        ihtml.write('<!DOCTYPE HTML>\n<html>\n<head>\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n<meta name="viewport" content="width=device-width, initial-scale=1">\n<title>SKE48 Blog</title>\n</head>\n<body>\n')
        for line in reversed(open(month+'/index', 'r').readlines()):
            ihtml.write(line+'<br>')
        ihtml.write('\n</body>\n</html>')
        ihtml.close()
    print "Index HTML files updated."