#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:     Extract the paste text from saved pastebin HTML pages
#
# Author:      new
#
# Created:     03/11/2013
# Copyright:   (c) new 2013
# Licence:     <your licence>
#-------------------------------------------------------------------------------
#!/usr/bin/env python

import re
import HTMLParser
import BeautifulSoup
import types


def readfile(path):
    """Read a text file and return its entire contents.

    The file is opened with universal-newline translation, so every line
    ending in the returned data is a single newline character.

    Args:
        path: Path of the file to read.

    Returns:
        Everything ``read()`` returns for the opened file.

    Raises:
        IOError: If the file cannot be opened or read.
    """
    try:
        handle = open(path, 'rU')
    except ValueError:
        # Interpreters from Python 3.11 on reject the 'U' flag; their plain
        # text mode already performs the same newline translation.
        handle = open(path, 'r')
    # The context manager closes the file even when read() raises.
    with handle:
        return handle.read()

def deescape(html):
    # de-escape html
    # http://stackoverflow.com/questions/2360598/how-do-i-unescape-html-entities-in-a-string-in-python-3-1
    deescaped_string = HTMLParser.HTMLParser().unescape(html)
    return deescaped_string

def replace_with_newlines(html):
    """Flatten HTML into text, emitting a newline for every ``<br>`` tag.

    Walks every descendant of the parsed document in order. String nodes
    contribute their whitespace-stripped text, ``<br>`` tags contribute a
    newline, and all other tags contribute nothing of their own (their
    string descendants are still visited).
    Approach taken from
    http://stackoverflow.com/questions/10491223/how-can-i-turn-br-and-p-into-line-breaks

    Args:
        html: Markup to flatten.

    Returns:
        The concatenated text.
    """
    soup = BeautifulSoup.BeautifulSoup(html)
    pieces = []
    for node in soup.recursiveChildGenerator():
        if isinstance(node, types.StringTypes):
            pieces.append(node.strip())
        elif node.name == 'br':
            pieces.append('\n')
    return ''.join(pieces)


def extract_raw_paste_text(html):
    """Return the text of the ``<textarea id="paste_code">`` element.

    Args:
        html: Markup of the page to search.

    Returns:
        The textarea's text, always longer than one character.

    Raises:
        AssertionError: If the page has no such textarea, or if its text is
            one character long or shorter.
    """
    soup = BeautifulSoup.BeautifulSoup(html)
    text_area_element = soup.find("textarea", {"id": "paste_code"})
    if text_area_element is None:
        # find() gives None when nothing matches. Report that the same way as
        # an empty paste rather than failing with AttributeError on ``.text``,
        # so callers that handle AssertionError can fall back.
        raise AssertionError("no textarea with id 'paste_code' found")
    text_area_text = text_area_element.text
    if len(text_area_text) <= 1:
        # Raised explicitly instead of via ``assert`` so the check is still
        # performed when Python runs with -O.
        raise AssertionError("paste_code textarea holds no usable text")
    return text_area_text












def extract_display_area(html):
    """Return the markup of the first ``<div class="text">`` element.

    Args:
        html: Markup of the page to search.

    Returns:
        The element serialised as a unicode string, or an empty unicode
        string when the page contains no such element.
    """
    soup = BeautifulSoup.BeautifulSoup(html)
    display_area_element = soup.find("div", {"class": "text"})
    if display_area_element is None:
        # unicode(None) would produce the literal text u'None', which would
        # then be mistaken for real paste content.
        return u''
    return unicode(display_area_element)

def extract_paste_from_display(html):
    """Fallback extraction: return the de-escaped display-area markup.

    Narrows the page down to its display area, prints that markup to
    stdout, and returns it with HTML character references de-escaped.

    TODO: separating the lines and adding a newline after each one is not
    implemented yet.

    Args:
        html: Markup of the page to search.

    Returns:
        The de-escaped display-area markup.
    """
    area_markup = extract_display_area(html)
    print(area_markup)
    return deescape(area_markup)


def check_if_pastebin_html(html):
    """Tell whether ``html`` contains the pastebin slogan text.

    Prints a notice to stdout when the slogan is found.

    Args:
        html: Page content to inspect.

    Returns:
        True if the slogan occurs in ``html``, otherwise False.
    """
    is_pastebin = "#1 paste tool since 2002" in html
    if is_pastebin:
        print("File was pastebin html!")
    return is_pastebin


def check_if_private_paste(html):
    """Tell whether ``html`` contains the private-paste warning text.

    Prints a notice to stdout when the warning is found.

    Args:
        html: Page content to inspect.

    Returns:
        True if the warning occurs in ``html``, otherwise False.
    """
    marker = "This is a private paste. If you created this paste, please"
    if marker not in html:
        return False
    print("Paste is private!")
    return True


def grab_paste(html):
    """Extract the paste text from a pastebin page.

    The ``paste_code`` textarea is tried first. If that extraction raises
    AssertionError or yields too little text, the display area is used
    instead.

    Args:
        html: Markup of the pastebin page.

    Returns:
        An empty string for a private paste, otherwise the de-escaped paste
        text (always longer than one character).

    Raises:
        AssertionError: If the fallback also yields one character or less.
    """
    if check_if_private_paste(html):
        return ""
    try:
        paste_text = deescape(extract_raw_paste_text(html))
        if len(paste_text) <= 1:
            # Raised explicitly instead of via ``assert`` so the fallback is
            # still reached when Python runs with -O.
            raise AssertionError("paste_code textarea yielded no text")
    except AssertionError:
        # extract_paste_from_display() already de-escapes its result.
        # De-escaping a second time would corrupt text such as "&amp;lt;".
        paste_text = extract_paste_from_display(html)
        if len(paste_text) <= 1:
            raise AssertionError("display area yielded no text")
    return paste_text


def main():
    """Manual smoke test against a saved pastebin page.

    Reads the sample file, then prints the extracted paste followed by the
    result of the pastebin-page check.
    """
    sample_path = "tests/pastebin/Untitled.2HtWWAps.1.htm"
    page = readfile(sample_path)
    print(grab_paste(page))
    print(check_if_pastebin_html(page))


# Run the smoke test only when this file is executed as a script.
if __name__ == '__main__':
    main()
