import win32com.client
from time import sleep

# Value of the InternetExplorer object's ReadyState property that means the
# document has finished loading (READYSTATE_COMPLETE).
_READYSTATE_COMPLETE = 4

# Seconds to sleep between two checks of the browser's loading state.
_POLL_INTERVAL = 0.25


def download_url_with_ie(url, timeout=30.0):
    """Load *url* in Internet Explorer and return the rendered body HTML.

    Starts IE through COM, navigates to the page, waits for it to finish
    loading, reads ``Document.body.innerHTML`` and shuts IE down again.
    Works only on Win32 with the Python win32com extensions installed, and
    needs IE.

    Why? For closed-source applications that go to great lengths to deliver
    browser-specific output and offer no interface other than a browser,
    when you want to get the data into CSV or XML for further analysis.

    Note: IE internally reformats all HTML into a mixed-case syntax without
    quotes around attributes. If you plan to parse the data, study the
    output of this function rather than relying on View-Source alone.

    If you call this function in a loop it is more efficient to open IE once
    outside the loop and reuse that instance; this function starts and quits
    a fresh IE instance on every call.

    Args:
        url: Address of the page to load.
        timeout: Approximate maximum number of seconds to wait for the page
            to finish loading. Only the sleep time is counted. When the
            limit is reached the function stops waiting and reads whatever
            IE has loaded so far (best effort) instead of raising.

    Returns:
        The body HTML as a native ``str`` restricted to ASCII; non-ASCII
        characters are dropped. The same text is also printed to stdout,
        which is what earlier versions of this function did instead of
        returning it.
    """
    ie = win32com.client.Dispatch("InternetExplorer.Application")
    try:
        ie.Visible = 1  # make this 0 if you want to hide the IE window
        ie.Navigate(url)

        # Poll instead of a single fixed sleep: fast pages return quickly,
        # slow pages get up to `timeout` seconds. ReadyState is checked in
        # addition to Busy because Busy may not be set yet right after
        # Navigate() returns.
        waited = 0.0
        while waited < timeout and (
            ie.Busy or ie.ReadyState != _READYSTATE_COMPLETE
        ):
            sleep(_POLL_INTERVAL)
            waited += _POLL_INTERVAL

        # The page is loaded and the DOM is filled, so grab the markup.
        html = ie.Document.body.innerHTML
    finally:
        # Always quit IE, even when navigation or DOM access fails;
        # otherwise every failed call leaves an IE process behind.
        ie.Quit()

    # Drop everything that is not ASCII. On Python 2 encode() already yields
    # a native str; on Python 3 it yields bytes, which are decoded back so
    # the result is a native str on both versions.
    text = html.encode('ascii', 'ignore')
    if not isinstance(text, str):
        text = text.decode('ascii')

    print(text)
    return text


download_url_with_ie('http://www.goermezer.de')