doc = docx.Document(file) for table in doc.tables: cell_text = table.rows[0].cells[0].text for row in table.rows: for idx, cell in enumerate(row.cells): next_idx = idx + cell.grid_span
扫描excel文件
def init(): """ 1、获取word文件列表 :return: 无 """ global result_excel global sheet3 global sheet2 global sheet1 global sheet3_column_list global sheet2_column_list global sheet1_column_list
for file in os.listdir(sheet1_2_dir): if (file.endswith(".docx") or file.endswith(".doc")) and not file.startswith("~$"): sheet1_2_file_list.append(os.path.join(sheet1_2_dir, file)) # print("sheet1_2_file_list",sheet1_2_file_list)
for file in os.listdir(sheet3_dir): if (file.endswith(".docx") or file.endswith(".doc")) and not file.startswith("~$"): sheet3_file_list.append(os.path.join(sheet3_dir, file)) # print("sheet3_file_list", sheet3_file_list)
# 读取结果excel文件 result_excel = openpyxl.load_workbook(result_file_path) sheet1 = result_excel.worksheets[0] sheet2 = result_excel.worksheets[1] sheet3 = result_excel.worksheets[2] # print(len(list(sheet1.rows)),type(sheet1.rows)) for cell in list(list(sheet1.rows)[1])[1:]: sheet1_column_list.append(cell.value) for cell in list(list(sheet2.rows)[1])[1:]: sheet2_column_list.append(cell.value) for cell in list(list(sheet3.rows)[1])[1:]: sheet3_column_list.append(cell.value)
# 处理sheet1和2 print("【开始】处理sheet1和2") for file in sheet1_2_file_list: print("处理文件:", file)
# # 测试 # init2()
doc = docx.Document(file) for table in doc.tables: parse_table_result = parse_table(table) if parse_table_result is not None: write_sheet(parse_table_result[0], parse_table_result[1]) # # 测试 # filename, extension = os.path.splitext(os.path.basename(file)) # save_wb(result_excel,os.path.join("result\sheet1_2",filename+".xlsx"))
print("【结束】处理sheet1和2")
# 处理sheet3 print("【开始】处理sheet3") for file in sheet3_file_list: print("处理文件:", file)
# # 测试 # init2()
doc = docx.Document(file) for table in doc.tables: parse_table_result = parse_table(table) if parse_table_result is not None: write_sheet(parse_table_result[0], parse_table_result[1])
if 1 == judge_sheet_type(cell_text): # 第一个sheet的表格 # print("处理第一个sheet的表") i = 0 data = [] for row in table.rows: for idx, cell in enumerate(row.cells): next_idx = idx + cell.grid_span tmp_cell_text = cell.text if i == len(sheet1_column_list): # sheet1中字段全部解析成功,返回结果 # print("解析sheet1表格成功") # print(list(zip(sheet1_column_list, data))) return 1, data
column = sheet1_column_list[i] if"长(m)" == column: if hasattr(cell, "text") and "长"in tmp_cell_text: data.append(row.cells[next_idx].text) i += 1 continue if"最径(m)" == column: if hasattr(cell, "text") and ("最径"in tmp_cell_text or "跨径组合"in tmp_cell_text): data.append(row.cells[next_idx].text) i += 1 continue
if column == tmp_cell_text: data.append(row.cells[next_idx].text) i += 1 continue if i # 表格遍历结束,sheet1字段没有遍历完,代表该表格 print("解析sheet1表格失败") print(list(zip(sheet1_column_list, data))) return None return 1, data
if 2 == judge_sheet_type(cell_text): # 第二个sheet的表格 # print("处理第二个sheet的表") i = 0 data = [] for row in table.rows: for idx, cell in enumerate(row.cells): next_idx = idx + cell.grid_span if i == len(sheet2_column_list): # sheet1中字段全部解析成功,返回结果 # print(list(zip(sheet2_column_list,data))) return 2, data
column = sheet2_column_list[i]
if"涵评" == column: if"涵总评"in cell.text: # data.append(row.cells[idx+3].text) data.append(row.cells[next_idx].text) i += 1 continue if"路编号" == column: if"路编号"in cell.text: # data.append(row.cells[idx+1].text) data.append(row.cells[next_idx].text) i += 1 continue if column in cell.text: # data.append(row.cells[idx + 2].text) data.append(row.cells[next_idx].text) i += 1 continue # print(data) if i # 表格遍历结束,sheet1字段没有遍历完,代表该表格 print("解析sheet2表格失败") print(list(zip(sheet2_column_list, data))) return None return 2, data
if 3 == judge_sheet_type(cell_text): # 第三个sheet的表格 # print("处理第三个sheet的表") i = 0 data = [] for j,row in enumerate(table.rows): for idx, cell in enumerate(row.cells): next_idx = idx + cell.grid_span if i == len(sheet3_column_list): # sheet中字段全部解析成功,返回结果 # print(list(zip(sheet3_column_list, data))) return 3, data
next7_list = ['长(m)', '宽(m)', '', '',''] if column in next7_list: if column in cell.text: # data.append(row.cells[next7_idx].text) data.append(row.cells[next_idx].text) i += 1 continue
if"评时间" == column: if"评时间" == cell.text: # print("处理评定时间字段") # 获取其下面所有的时间和所有的检查结论 time_str = "" check_result_str = ""
k = 1 while True: tmp_cell = table.cell(j+k, idx) tmp_cell_text = tmp_cell.text tmp_cell_text = tmp_cell_text.encode("gb2312", errors='ignore').decode("gb2312") # print("tmp_cell", tmp_cell_text) r = re.fullmatch(r'\d{4}', tmp_cell_text) if r is not None: time_str += tmp_cell_text time_str += "、"
data.append(time_str) i += 1 data.append(check_result_str) i += 1 continue
if column in cell.text: data.append(row.cells[next5_idx].text) i += 1 continue # print(data) if i # 表格遍历结束,sheet1字段没有遍历完,代表该表格 print("解析sheet3表格失败") print(list(zip(sheet3_column_list, data))) return None return 3, data