Grab text or source from HTML pages

import win32com.client
from time import sleep

def download_url_with_ie(url, ie=None, timeout=30, poll_interval=0.25):
    """
    Load ``url`` in Internet Explorer and return the rendered body HTML.

    When no ``ie`` instance is supplied this works only on Win32 with the
    Python win32com extensions installed, and needs IE. Why IE? Some
    closed-source applications go to tremendous lengths to deliver
    browser-specific output and offer no interface other than a browser;
    driving IE is then a way to get their data out (e.g. into CSV or XML)
    for further analysis.

    Note: IE internally reformats all HTML to mixed-case tags with no
    quotes around attributes. If you plan to parse the data, study the
    output of this function rather than relying on View-Source alone.

    Args:
        url: Address to navigate to.
        ie: Optional, already created ``InternetExplorer.Application`` COM
            object. When calling this function in a loop it is more
            efficient to create IE once and pass the same instance in.
            A supplied instance is left running and its visibility is not
            touched; when omitted, a fresh instance is started, made
            visible, and quit before returning.
        timeout: Maximum number of seconds to wait for the page to load.
            If the page is still loading after that, whatever the DOM
            holds at that moment is read (best effort).
        poll_interval: Seconds to sleep between load-state checks.

    Returns:
        The ``innerHTML`` of the document body as a native ``str`` with
        every non-ASCII character dropped. The text is also printed, as
        the original script did.

    Raises:
        ValueError: If ``poll_interval`` is not positive.
    """
    if poll_interval <= 0:
        # A non-positive interval would never advance the wait counter.
        raise ValueError("poll_interval must be positive")

    # READYSTATE_COMPLETE in IE's READYSTATE enumeration.
    ready_state_complete = 4

    owns_ie = ie is None
    if owns_ie:
        ie = win32com.client.Dispatch("InternetExplorer.Application")

    try:
        if owns_ie:
            ie.Visible = 1  # make this 0 if you want to hide the IE window
        ie.Navigate(url)

        # Poll instead of sleeping a fixed 5 seconds: fast pages return
        # immediately, slow pages get up to `timeout` seconds.
        waited = 0
        while ie.Busy or ie.ReadyState != ready_state_complete:
            if waited >= timeout:
                break  # best effort: read whatever has loaded so far
            sleep(poll_interval)
            waited += poll_interval

        html = ie.Document.body.innerHTML
    finally:
        # Quit even when navigation or the DOM read fails, so that no
        # orphaned IE process is left behind. **very important**
        if owns_ie:
            ie.Quit()

    # innerHTML is unicode text: strip everything outside ASCII, then
    # coerce to the native str type (works on both Python 2 and 3).
    text = str(html.encode('ascii', 'ignore').decode('ascii'))
    print(text)
    return text
# Run the demo download only when executed as a script, so that importing
# this module does not launch IE as a side effect.
if __name__ == "__main__":
    download_url_with_ie('http://www.goermezer.de')

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.