import csv
import os
import re
from BeautifulSoup import *

# FUNCTIONS

def gettextonly(soup):
    """Recurse through an HTML element and return only its text.

    Args:
        soup: A parsed node exposing ``string`` and, when ``string`` is
            None, ``contents`` (presumably a BeautifulSoup 3 tag or
            string node -- confirm against callers).

    Returns:
        The text of the node with every run of spaces and/or newlines
        collapsed to a single space and surrounding whitespace stripped.
        For a node whose ``string`` is None, the non-empty texts of its
        children joined by single spaces ('' when there are none).
    """
    text = soup.string
    if text is None:
        # No single string on this node: collect the text of every child.
        # Empty pieces are skipped so they do not introduce double spaces,
        # and the separator is applied once between pieces (stripping
        # inside the loop used to glue neighbouring texts together).
        pieces = [gettextonly(child) for child in soup.contents]
        return ' '.join(piece for piece in pieces if piece)

    # Re-parse the string with convertEntities=HTML_ENTITIES and keep the
    # first resulting node -- presumably to turn HTML entities into plain
    # characters; verify against the BeautifulSoup 3 documentation.
    decoded = BeautifulStoneSoup(
        text,
        convertEntities=BeautifulStoneSoup.HTML_ENTITIES
    ).contents[0]

    # One character class so that mixed runs such as " \n " also collapse
    # to a single space, not one space per sub-run.
    return re.sub(r'[ \n]+', ' ', decoded).strip()

def tabletolist(souptbl):
    """Turn an HTML table node into a list of rows of cell texts.

    Every ``tr`` found under ``souptbl`` contributes one inner list that
    holds the ``gettextonly`` result of each ``td`` found under that row,
    in the order ``findAll`` returns them. A row without any ``td``
    yields an empty inner list.
    """
    return [
        [gettextonly(cell) for cell in tr.findAll('td')]
        for tr in souptbl.findAll('tr')
    ]


# MAIN PART OF SCRIPT

# Pattern for the first NN/NN/NNNN date on a page; compiled once because it
# is the same for every file.
date_pattern = re.compile(r'\d\d/\d\d/\d{4}')

# open files to write data in csv format
# The ``with`` block closes (and thereby flushes) both files even when a
# page fails to parse. Binary mode is what the Python 2 csv module expects;
# text mode can produce doubled line endings on some platforms.
with open('info.csv', 'wb') as info_file, \
        open('data.csv', 'wb') as data_file:
    infowriter = csv.writer(info_file)
    datawriter = csv.writer(data_file)

    # read list of files from csv folder and order them
    files = sorted(os.listdir('csv'))

    # NOTE(review): not used anywhere in the visible script; kept in case
    # code outside this view relies on it.
    out = []

    for filename in files:
        # Single parenthesised string: prints exactly the same text under
        # the Python 2 print statement and stays valid as a function call.
        print('processing:  ' + filename)

        # open and parse html file; the handle is closed right after reading
        path = os.path.join('csv', filename)
        with open(path, 'r') as f:
            html = f.read()
        soup = BeautifulSoup(html)
        t = soup.findAll('table')

        # extract general information about election and write into info
        # csv file
        date_match = date_pattern.search(html)
        if date_match is None:
            # Fail with the offending file name, not an opaque
            # AttributeError on None.
            raise ValueError('no NN/NN/NNNN date found in %s' % path)
        election_date = date_match.group(0)

        # The first table carries the summary figures at fixed cell
        # positions. '.' is removed from each figure -- presumably a
        # thousands separator; confirm against the source pages.
        info = tabletolist(t[0])
        electorate = info[0][1].replace('.', '')
        votescast = info[1][1].replace('.', '')
        votesvalid = info[0][4].replace('.', '')

        infowriter.writerow(
            [election_date, electorate, votescast, votesvalid])

        # extract election results and write into data csv file
        data = []
        for d in tabletolist(t[1]):
            if not d:
                # rows without any <td> cell come back as empty lists
                continue
            data.append([
                d[0],
                int(d[1].replace('.', '')),
                # ',' -> '.' so float() accepts the value (presumably a
                # decimal comma in the source pages)
                float(d[2].replace(',', '.')),
                int(d[3]) if d[3] else None,
            ])

        datawriter.writerows([[election_date] + line for line in data])


