Source code for docx2csv.converter

# -*- coding: utf8 -*-

import csv
import xlwt
import openpyxl

from docx import Document
from docx.table import _Cell
from docx.oxml.simpletypes import ST_Merge


def __extract_table(table):
    """Extracts table data from table object

    Walks the rows (``table._tbl.tr_lst``) and cells (``tr.tc_lst``) of the
    table and returns a list of rows, each row being a list of cell texts
    with newlines replaced by spaces.

    Merged cells are expanded so that every grid column gets a value:

    * a cell spanning several columns (``grid_span`` > 1) repeats its text
      once per spanned column;
    * a cell continuing a vertical merge (``vMerge`` is
      ``ST_Merge.CONTINUE``) repeats the value of the same column in the
      row above.
    """
    results = []
    for tr in table._tbl.tr_lst:
        row = []
        for tc in tr.tc_lst:
            for grid_span_idx in range(tc.grid_span):
                # Index of the grid column that is about to be filled.
                col = len(row)
                if (
                    tc.vMerge == ST_Merge.CONTINUE
                    and results
                    and col < len(results[-1])
                ):
                    # Vertical merge continuation: copy the value directly
                    # above, i.e. the SAME column of the previous row.
                    value = results[-1][col]
                elif grid_span_idx > 0:
                    # Later columns of a horizontal span repeat the value
                    # just written for the first column of the span.
                    value = row[-1]
                else:
                    # Ordinary cell, or a continuation cell with no usable
                    # row above: fall back to the cell's own text.
                    value = _Cell(tc, table).text.replace("\n", " ")
                row.append(value)
        results.append(row)
    return results


def __store_table(tabdata, filename, format="csv"):
    """Saves table data as a csv, tsv, xls or xlsx file

    :param tabdata: the table as an iterable of rows, each row an iterable
        of cell values
    :param filename: path of the file to write
    :param format: one of "csv", "tsv", "xls" or "xlsx"; for any other
        value nothing is written
    """
    if format in ("csv", "tsv"):
        delimiter = "," if format == "csv" else "\t"
        # newline="" leaves line endings to the csv module (avoids doubled
        # carriage returns on platforms that translate "\n"); the "with"
        # block guarantees the file is flushed and closed.
        with open(filename, "w", encoding="utf8", newline="") as f:
            csv.writer(f, delimiter=delimiter).writerows(tabdata)
    elif format == "xls":
        workbook = xlwt.Workbook()
        __xls_table_to_sheet(tabdata, workbook.add_sheet("0"))
        workbook.save(filename)
    elif format == "xlsx":
        # NOTE(review): openpyxl workbooks are created with a default sheet,
        # so the saved file presumably also holds an empty sheet in front of
        # sheet "0" -- confirm whether that is intended.
        workbook = openpyxl.Workbook()
        __xlsx_table_to_sheet(tabdata, workbook.create_sheet("0"))
        workbook.save(filename)

def __xls_table_to_sheet(table, ws):
    """Writes every cell of table into sheet ws and returns the sheet

    Each value is passed to ``ws.write(row_index, column_index, value)``
    with zero-based indexes.
    """
    for row_idx, cells in enumerate(table):
        for col_idx, value in enumerate(cells):
            ws.write(row_idx, col_idx, value)
    return ws


def __xlsx_table_to_sheet(table, ws):
    """Appends every row of table to sheet ws and returns the sheet"""
    for cells in table:
        ws.append(cells)
    return ws


def extract_tables(filename):
    """Extracts tables from .DOCX files

    :param filename: document to open; passed unchanged to ``Document``
    :return: list with one entry per item of ``document.tables``, in the
        order that attribute yields them; each entry is the list of rows
        produced by ``__extract_table``
    """
    document = Document(filename)
    return [__extract_table(table) for table in document.tables]
def extract(filename, format="csv", sizefilter=0, singlefile=False, output=None):
    """Extracts tables from a .DOCX file and saves them as csv, tsv, xls or xlsx files

    :param filename: path of the source document; the part before its last
        "." is the base name of the default output path(s)
    :param format: output format, compared case-insensitively
    :param sizefilter: converted with ``int()``; tables having this many rows
        or fewer are skipped
    :param singlefile: when true, all kept tables become sheets "1", "2", ...
        of one workbook saved as ``<name>.<format>``. Only "xls" and "xlsx"
        are handled in this mode; any other format writes nothing. When
        false, each kept table is stored in its own file
        ``<name>_<n>.<format>``
    :param output: explicit destination path used instead of the default
        name. In multi-file mode every kept table is written to this same
        path
    """
    tables = extract_tables(filename)
    name = filename.rsplit(".", 1)[0]
    format = format.lower()
    lfilter = int(sizefilter)
    # Filter first so sheet and file numbers count only tables that are
    # actually written.
    kept = [t for t in tables if len(t) > lfilter]

    if singlefile:
        if format == "xls":
            workbook = xlwt.Workbook()
            for n, t in enumerate(kept, 1):
                __xls_table_to_sheet(t, workbook.add_sheet(str(n)))
        elif format == "xlsx":
            workbook = openpyxl.Workbook()
            for n, t in enumerate(kept, 1):
                __xlsx_table_to_sheet(t, workbook.create_sheet(str(n)))
        else:
            # No single-file writer exists for the remaining formats.
            return
        destname = output if output else name + ".%s" % (format)
        workbook.save(destname)
    else:
        for n, t in enumerate(kept, 1):
            destname = output if output else name + "_%d.%s" % (n, format)
            __store_table(t, destname, format)