|
Form extractor from Word to Excel |
|
|
|
Konrads Smelkovs had a bunch of filled-out Word documents with Word forms in them and needed the form contents in Excel. Initially he tried CSV, but it didn't play nicely with encodings, so he decided to write directly to XLS:
"""
Copyright 2009 Konrads Smelkovs <konrads [at] smelkovs.com>
UTF8Recorder and UnicodeWriter come from python docs
"""
import sys,os,csv
import win32com.client
import pywintypes
class ExcelWriter(object):
    """Append rows of values to the first worksheet of a new Excel workbook.

    Excel is driven through COM automation (``win32com``). Each call to
    :meth:`writerow` fills one worksheet row, starting at row 1, and the
    workbook is saved and Excel is shut down by :meth:`close` (also invoked
    from ``__del__``).
    """

    def __init__(self, excelfile):
        """Start a hidden Excel instance and create the workbook *excelfile*.

        An already existing file at *excelfile* is deleted first so that
        ``SaveAs`` writes a fresh workbook.
        """
        self.excelapp = win32com.client.DispatchEx('Excel.Application')
        self.excelapp.Visible = 0
        self.excelapp.Application.AskToUpdateLinks = 0
        self.workbook = self.excelapp.Workbooks.Add()
        # TODO: remove for release
        # Only delete when the file is really there; an unconditional
        # unlink raised OSError on the very first run.
        if os.path.exists(excelfile):
            os.unlink(excelfile)
        self.workbook.SaveAs(excelfile)
        # Only worksheet 1 is used.
        self.worksheet = self.workbook.Worksheets.Item(1)
        self.currentrow = 1

    def _getrow(self, row):
        """Convert a 1-based column index to its A1-style column letters.

        Despite the historical name, the value converted is a *column*
        number: 1 -> A, 2 -> B, ..., 26 -> Z, 27 -> AA, 52 -> AZ, 703 -> AAA.

        Raises ValueError if *row* is smaller than 1.
        """
        if row < 1:
            raise ValueError("column index must be >= 1, got %r" % (row,))
        letters = []
        # Column names are a bijective base-26 numeral: there is no zero
        # digit, hence the "- 1" before every division step.
        while row > 0:
            row, remainder = divmod(row - 1, 26)
            letters.append(chr(ord('A') + remainder))
        return ''.join(reversed(letters))

    def close(self):
        """Save and close the workbook, then quit Excel.

        Safe to call more than once, and safe on an instance whose
        ``__init__`` did not complete.
        """
        workbook = getattr(self, 'workbook', None)
        excelapp = getattr(self, 'excelapp', None)
        # Drop the references first so a second call becomes a no-op.
        self.workbook = None
        self.excelapp = None
        try:
            if workbook is not None:
                workbook.Save()
                workbook.Close()
        finally:
            # Quit even when saving failed, so no hidden Excel is left behind.
            if excelapp is not None:
                excelapp.Quit()

    def __del__(self):
        self.close()

    def writerow(self, data):
        """Write the items of *data* into consecutive cells of the current row.

        The first item goes to column A. Afterwards the current row advances
        by one, so the next call fills the following row.
        """
        for col, value in enumerate(data, 1):
            address = self._getrow(col) + str(self.currentrow)
            sys.stderr.write("Range: %s\n" % address)
            self.worksheet.Range(address).Value = value
        self.currentrow += 1
def main():
if len(sys.argv)<3:
print "Usage: %s <directory> <outfile.csv>" % sys.argv[0]
print "Where <directory> - directory containing word docs with forms"
print "and <outfile.csv> - file where to put results"
sys.exit(-1)
directory=os.path.abspath(sys.argv[1])
wordapp = win32com.client.Dispatch("Word.Application")
wordapp.Visible=0 # Hide word app
results=[]
for docfile in os.listdir(directory):
thisdocresults=[]
if docfile.endswith(".doc") or docfile.endswith(".docx"):
print >> sys.stderr, "Processing %s" % docfile
worddoc=wordapp.Documents.Open(os.path.join(directory,docfile))
for i in range(1,worddoc.FormFields.Count+1):
try:
form=worddoc.FormFields.Item(i)
name=form.Name
value=form.Result
thisdocresults.append((name,value))
try:
print >>sys.stderr, "%s: %s" % (name,value)
except UnicodeEncodeError,e:
print >>sys.stderr, "Error decoding charset,%s" % e
except pywintypes.com_error,e:
print >>sys.stderr, "Exception: %s" % str(e)
results.append(thisdocresults)
worddoc.Close()
wordapp.Quit()
writer=ExcelWriter(os.path.abspath(sys.argv[2]))
print >>sys.stderr,"Writing to Excel"
for docres in results:
data=[]
for (n,v) in docres:
data.append(v)
writer.writerow(data)
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Last Updated ( Friday, 05 November 2010 )
|