#! /usr/bin/env python
"""Extract the text (and optionally the images) contained in a docx file.

Only the standard library is used: a docx file is a zip archive whose
WordprocessingML parts are parsed with ``xml.etree.ElementTree``.
"""

import argparse
import os
import re
import sys
import xml.etree.ElementTree as ET
import zipfile

nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Archive member names of the header / footer parts.  The dot is escaped and
# the pattern is anchored at the end so that only real ``.xml`` parts match.
_HEADER_RE = re.compile(r'word/header[0-9]*\.xml$')
_FOOTER_RE = re.compile(r'word/footer[0-9]*\.xml$')

# Lower-cased extensions of the archive members treated as images.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')


def process_args():
    """Parse and validate the command-line arguments.

    Exits with status 1 when the docx file does not exist or when the
    requested image directory is missing and cannot be created.

    Returns:
        The ``argparse`` namespace with the attributes ``docx`` and
        ``img_dir`` (``None`` when no image directory was requested).
    """
    parser = argparse.ArgumentParser(description='A pure python-based utility '
                                                 'to extract text and images '
                                                 'from docx files.')
    parser.add_argument("docx", help="path of the docx file")
    parser.add_argument('-i', '--img_dir', help='path of directory '
                                                'to extract images')
    args = parser.parse_args()

    if not os.path.exists(args.docx):
        print('File {} does not exist.'.format(args.docx))
        sys.exit(1)

    if args.img_dir is not None:
        if not os.path.exists(args.img_dir):
            try:
                os.makedirs(args.img_dir)
            except OSError:
                print("Unable to create img_dir {}".format(args.img_dir))
                sys.exit(1)
    return args


def qn(tag):
    """Return the Clark-notation qualified name for a prefixed tag name.

    'qn' stands for 'qualified name'.  The prefix is looked up in the
    module-level ``nsmap``, so ``qn('w:t')`` returns
    ``'{http://schemas.../main}t'``, which is the form ElementTree uses
    for ``Element.tag``.

    Raises:
        ValueError: if ``tag`` does not contain exactly one ``':'``.
        KeyError: if the prefix is not present in ``nsmap``.

    Source: https://github.com/python-openxml/python-docx/
    """
    prefix, tagroot = tag.split(':')
    uri = nsmap[prefix]
    return '{{{}}}{}'.format(uri, tagroot)


def xml2text(xml):
    """Return the textual content of a WordprocessingML part.

    ``xml`` is the serialized XML (text or bytes) of one part.  Every
    element of the tree is visited in document order and translated:
    ``<w:t>`` contributes its text, ``<w:tab/>`` a tab, ``<w:br/>`` and
    ``<w:cr/>`` a newline, and each ``<w:p>`` a blank line *before* the
    text of the paragraph.  All other elements are ignored.

    Adapted from: https://github.com/python-openxml/python-docx/
    """
    # Resolve the qualified names once instead of once per element.
    text_tag = qn('w:t')
    tab_tag = qn('w:tab')
    break_tags = (qn('w:br'), qn('w:cr'))
    para_tag = qn('w:p')

    pieces = []
    for child in ET.fromstring(xml).iter():
        tag = child.tag
        if tag == text_tag:
            # An empty <w:t/> has text None; it contributes nothing.
            if child.text is not None:
                pieces.append(child.text)
        elif tag == tab_tag:
            pieces.append('\t')
        elif tag in break_tags:
            pieces.append('\n')
        elif tag == para_tag:
            pieces.append('\n\n')
    return u''.join(pieces)


def _extract_images(zipf, filelist, img_dir):
    """Copy every image member of the open archive ``zipf`` into ``img_dir``."""
    for fname in filelist:
        _, extension = os.path.splitext(fname)
        # Compare case-insensitively so that e.g. 'image1.PNG' is extracted.
        if extension.lower() in _IMAGE_EXTENSIONS:
            # Only the base name is used, so a member can never be written
            # outside of img_dir.
            dst_fname = os.path.join(img_dir, os.path.basename(fname))
            with open(dst_fname, "wb") as dst_f:
                dst_f.write(zipf.read(fname))


def process(docx, img_dir=None):
    """Extract the text of a docx file and optionally its images.

    Args:
        docx: path of the docx file, or a file-like object accepted by
            ``zipfile.ZipFile``.
        img_dir: existing directory into which the images of the document
            are written; nothing is extracted when it is ``None``.

    Returns:
        The text of the headers, the main document and the footers, in
        that order, with leading and trailing whitespace stripped.

    Raises:
        zipfile.BadZipfile: if ``docx`` is not a zip archive.
        KeyError: if the archive has no ``word/document.xml`` member.
    """
    # The archive is closed by the ``with`` block even when a part is
    # missing or cannot be parsed.
    with zipfile.ZipFile(docx) as zipf:
        filelist = zipf.namelist()

        # There can be several header parts in the archive.
        parts = [xml2text(zipf.read(fname))
                 for fname in filelist if _HEADER_RE.match(fname)]

        # Main text.
        parts.append(xml2text(zipf.read('word/document.xml')))

        # There can be several footer parts in the archive.
        parts.extend(xml2text(zipf.read(fname))
                     for fname in filelist if _FOOTER_RE.match(fname))

        if img_dir is not None:
            _extract_images(zipf, filelist, img_dir)

    return u''.join(parts).strip()


def main():
    """Command-line entry point: write the extracted text to stdout as UTF-8."""
    args = process_args()
    text = process(args.docx, args.img_dir)
    # On Python 3 sys.stdout is a text stream and rejects bytes; its
    # underlying binary buffer is used when it exists.  On Python 2
    # sys.stdout accepts the encoded bytes directly.
    stream = getattr(sys.stdout, 'buffer', sys.stdout)
    stream.write(text.encode('utf-8'))


if __name__ == '__main__':
    main()