Py学习  »  Python

代码分享,300元Python副业单,解析Word文件中的多个Excel表格

蚂蚁学Python • 1 年前 • 632 次点击  

总结下知识,这两句代码,可以得到一个word文件中的多个excel文件:

doc = docx.Document(file)
for table in doc.tables:
  cell_text = table.rows[0].cells[0].text
  for row in table.rows:
    for idx, cell in enumerate(row.cells):
        next_idx = idx + cell.grid_span

扫描excel文件

def init():
    """
    1、获取word文件列表
    :return: 无
    "
""
    global result_excel
    global sheet3
    global sheet2
    global sheet1
    global sheet3_column_list
    global sheet2_column_list
    global sheet1_column_list

    for file in os.listdir(sheet1_2_dir):
        if (file.endswith(".docx") or file.endswith(".doc")) and not file.startswith("~$"):
            sheet1_2_file_list.append(os.path.join(sheet1_2_dir, file))
    # print("sheet1_2_file_list",sheet1_2_file_list)

    for file in os.listdir(sheet3_dir):
        if (file.endswith(".docx") or file.endswith(".doc")) and not file.startswith("~$"):
            sheet3_file_list.append(os.path.join(sheet3_dir, file))
    # print("sheet3_file_list", sheet3_file_list)

    # 读取结果excel文件
    result_excel = openpyxl.load_workbook(result_file_path)
    sheet1 = result_excel.worksheets[0]
    sheet2 = result_excel.worksheets[1]
    sheet3 = result_excel.worksheets[2]
    # print(len(list(sheet1.rows)),type(sheet1.rows))
    for cell in list(list(sheet1.rows)[1])[1:]:
        sheet1_column_list.append(cell.value)
    for cell in list(list(sheet2.rows)[1])[1:]:
        sheet2_column_list.append(cell.value)
    for cell in list(list(sheet3.rows)[1])[1:]:
        sheet3_column_list.append(cell.value)

    # print(sheet1_column_list)
    # print(sheet2_column_list)
    # print(sheet3_column_list)

读取word文件

可以得到一个word的多个excel文件;

    # 处理sheet1和2
    print("【开始】处理sheet1和2")
    for file in sheet1_2_file_list:
        print("处理文件:", file)

        # # 测试
        # init2()

        doc = docx.Document(file)
        for table in doc.tables:
            parse_table_result = parse_table(table)
            if parse_table_result is not None:
                write_sheet(parse_table_result[0], parse_table_result[1])
        # # 测试
        # filename, extension = os.path.splitext(os.path.basename(file))
        # save_wb(result_excel,os.path.join("result\sheet1_2",filename+".xlsx"))

    print("【结束】处理sheet1和2")

    # 处理sheet3
    print("【开始】处理sheet3")
    for file in sheet3_file_list:
        print("处理文件:", file)

        # # 测试
        # init2()

        doc = docx.Document(file)
        for table in doc.tables:
            parse_table_result = parse_table(table)
            if parse_table_result is not None:
                write_sheet(parse_table_result[0], parse_table_result[1])

        # # 测试
        # filename, extension = os.path.splitext(os.path.basename(file))
        # save_wb(result_excel, os.path.join("result\sheet3", filename+".xlsx"))

    print("【结束】处理sheet3")

    save_wb(result_excel, new_result_file_path)
    print("保存成功")
    input("按回车键退出程序。。。")

解析word文件中的表格

def parse_table(table):
    """
    处理word中table对象,提取数据到结果excel中
    :param table: word中表格对象
    :return: None 或 (sheet_x,data):不是需要解析的表格。sheet_x:: 1:sheet1的表格,2:sheet2的表格,3:sheet3的表格。data::数据列表
    "
""
    # # 测试
    # rows = len(table.rows)  # 获取表格的行数
    # cols = len(table.columns)  # 获取表格的列数
    # print(f"表格大小:行数={rows}, 列数={cols}")
    # for row in table.rows:
    #     for cell in row.cells:
    #         print(cell.text, end=" |")
    #     print()
    #     break
    # print(table.cell(0, 0))

    # try:
    #     cell_text = table.cell(0, 0).text
    # except:
    #     return None

    cell_text = table.rows[0].cells[0].text

    cell_text = cell_text.encode("gb2312", errors='ignore').decode("gb2312")
    # print("cell_text", cell_text, "judge_sheet_type(cell_text)", judge_sheet_type(cell_text))
    if -1 == judge_sheet_type(cell_text):
        # 不属于任何一类sheet,继续检测下一个表格
        return None

    if 1 == judge_sheet_type(cell_text):
        # 第一个sheet的表格
        # print("处理第一个sheet的表")
        i = 0
        data = []
        for row in table.rows:
            for idx, cell in enumerate(row.cells):
                next_idx = idx + cell.grid_span
                tmp_cell_text = cell.text
                if i == len(sheet1_column_list):
                    # sheet1中字段全部解析成功,返回结果
                    # print("解析sheet1表格成功")
                    # print(list(zip(sheet1_column_list, data)))
                    return 1, data

                column = sheet1_column_list[i]
                if "长(m)" == column:
                    if hasattr(cell, "text") and "长" in tmp_cell_text:
                        data.append(row.cells[next_idx].text)
                        i += 1
                        continue
                if "最径(m)" == column:
                    if hasattr(cell, "text") and ("最径" in tmp_cell_text or "跨径组合" in tmp_cell_text):
                        data.append(row.cells[next_idx].text)
                        i += 1
                        continue
                
                if column == tmp_cell_text:
                    data.append(row.cells[next_idx].text)
                    i += 1
                    continue
        if i             # 表格遍历结束,sheet1字段没有遍历完,代表该表格
            print("解析sheet1表格失败")
            print(list(zip(sheet1_column_list, data)))
            return None
        return 1, data

    if 2 == judge_sheet_type(cell_text):
        # 第二个sheet的表格
        # print("处理第二个sheet的表")
        i = 0
        data = []
        for row in table.rows:
            for idx, cell in enumerate(row.cells):
                next_idx = idx + cell.grid_span
                if i == len(sheet2_column_list):
                    # sheet1中字段全部解析成功,返回结果
                    # print(list(zip(sheet2_column_list,data)))
                    return 2, data

                column = sheet2_column_list[i]

                if "涵评" == column:
                    if "涵总评" in cell.text:
                        # data.append(row.cells[idx+3].text)
                        data.append(row.cells[next_idx].text)
                        i += 1
                        continue
                if "路编号" == column:
                    if "路编号" in cell.text:
                        # data.append(row.cells[idx+1].text)
                        data.append(row.cells[next_idx].text)
                        i += 1
                        continue
                if column in cell.text:
                    # data.append(row.cells[idx + 2].text)
                    data.append(row.cells[next_idx].text)
                    i += 1
                    continue
        # print(data)
        if i             # 表格遍历结束,sheet1字段没有遍历完,代表该表格
            print("解析sheet2表格失败")
            print(list(zip(sheet2_column_list, data)))
            return None
        return 2, data

    if 3 == judge_sheet_type(cell_text):
        # 第三个sheet的表格
        # print("处理第三个sheet的表")
        i = 0
        data = []
        for j,row in enumerate(table.rows):
            for idx, cell in enumerate(row.cells):
                next_idx = idx + cell.grid_span
                if i == len(sheet3_column_list):
                    # sheet中字段全部解析成功,返回结果
                    # print(list(zip(sheet3_column_list, data)))
                    return 3, data

                column = sheet3_column_list[i]
                next5_idx = idx + 5
                next7_idx = idx + 7

                next7_list = ['长(m)''宽(m)''''','']
                if column in next7_list:
                    if column in cell.text:
                        # data.append(row.cells[next7_idx].text)
                        data.append(row.cells[next_idx].text)
                        i += 1
                        continue

                if "评时间" == column:
                    if "评时间" == cell.text:
                        # print("处理评定时间字段")
                        # 获取其下面所有的时间和所有的检查结论
                        time_str = ""
                        check_result_str = ""
                        k = 1
                        while True:
                            tmp_cell = table.cell(j+k, idx)
                            tmp_cell_text = tmp_cell.text
                            tmp_cell_text = tmp_cell_text.encode("gb2312", errors='ignore').decode("gb2312")
                            # print("tmp_cell", tmp_cell_text)
                            r = re.fullmatch(r'\d{4}', tmp_cell_text)
                            if r is not None:
                                time_str += tmp_cell_text
                                time_str += "、"

                                tmp2_cell = table.cell(j + k, idx+7)
                                check_result_str += tmp2_cell.text
                                check_result_str += "、"

                                k += 1
                                continue
                            break

                        data.append(time_str)
                        i += 1
                        data.append(check_result_str)
                        i += 1
                        continue

                if column in cell.text:
                    data.append(row.cells[next5_idx].text)
                    i += 1
                    continue
        # print(data)
        if i             # 表格遍历结束,sheet1字段没有遍历完,代表该表格
            print("解析sheet3表格失败")
            print(list(zip(sheet3_column_list, data)))
            return None
        return 3, data

生成结果excel文件


def write_sheet(sheet_x: int, data: list):
    """
    往sheet中写入数据data
    :param sheet_x: 第几个sheet,数字类型:1、2、3
    :param data: 数据列表
    :return: 无
    "
""
    if sheet_x  3:
        return

    my_sheet = ""
    if 1 == sheet_x:
        my_sheet = sheet1
    if 2 == sheet_x:
        my_sheet = sheet2
    if 3 == sheet_x:
        my_sheet = sheet3

    # 获取当前行数
    cur_row_num = len(list(my_sheet.rows))
    # 新增一行数据
    data.insert(0, cur_row_num - 1)
    my_sheet.append(data)


def save_wb(excel: openpyxl.Workbook,new_file: str):
    """
    保存excel
    :param excel: 要保存的excel对象
    :param new_file: 保存到新文件的名称
    :return: 无
    "
""
    excel.save(new_file)

代码分享,百度网盘链接:

通过百度网盘分享的文件:handle_word(1).zip 链接:https://pan.baidu.com/s/1plViPhB4LxjS_8JgfQ8mug?pwd=1b7f 提取码:1b7f --来自百度网盘超级会员V6的分享


学习更多办公自动化技巧,可以学习蚂蚁老师的Pandas视频课程:


Python社区是高质量的Python/Django开发社区
本文地址:http://www.python88.com/topic/172893