import logging
import sys
from html import escape

import mammoth
from bs4 import BeautifulSoup
from docx import Document

from app.tools.beautiful_html import beautiful_report

# Module-level logger.
logger = logging.getLogger(__name__)

# Tag names treated as table cells when scanning a row.
_CELL_TAGS = ["th", "td"]


def docx2html(file_path):
    """Convert a .docx file to HTML that can be returned to the front end.

    The document is converted with mammoth first. Each ``<table>`` in that
    output is then replaced by a table rebuilt by ``table_to_html`` and
    post-processed by ``merge_cells``. The swap only happens when both
    sources yield the same number of tables, because they are paired by
    position.

    Args:
        file_path: Path of the .docx file to convert.

    Returns:
        The value returned by ``beautiful_report`` for the prettified HTML,
        or ``None`` if the conversion failed (the failure is logged).
    """
    try:
        logger.info("进入解析后的html的单元格合并的主方法")
        # HTML of the whole document; merged cells are not rendered
        # correctly in it.
        original_html = all_to_html(file_path)
        if original_html is None:
            # all_to_html has already logged the cause; nothing to build on.
            return None

        # One HTML fragment per table in the Word document.
        table_new_list = table_to_html(file_path)

        original_soup = BeautifulSoup(original_html, "html.parser")
        table_old_list = original_soup.find_all("table")

        # Tables are paired by position, so the swap is skipped when the
        # counts differ or when the table extraction failed.
        if table_new_list is not None and len(table_old_list) == len(
            table_new_list
        ):
            for old_table, new_markup in zip(table_old_list, table_new_list):
                new_table = BeautifulSoup(new_markup, "html.parser").find(
                    "table"
                )
                if new_table is None:
                    continue
                merged_table = merge_cells(new_table)
                if merged_table is None:
                    # Merging failed (already logged): keep mammoth's table
                    # rather than passing None to replace_with().
                    continue
                old_table.replace_with(merged_table)

        pretty_html = original_soup.prettify()
        return beautiful_report(pretty_html)
    except Exception as e:
        logger.exception("合并单元格主方法执行失败:%s", e)
        return None


def all_to_html(docx_file):
    """Convert a whole .docx file to HTML with mammoth.

    No attempt is made here to get merged table cells right.

    Args:
        docx_file: Path of the .docx file.

    Returns:
        The HTML string produced by mammoth, or ``None`` if the conversion
        failed (the failure is logged).
    """
    try:
        logger.info("进入通用docx转html方法,此时单元格未合并")
        # Use a separate name for the handle so the path argument is not
        # rebound inside the function.
        with open(docx_file, "rb") as docx_stream:
            result = mammoth.convert_to_html(docx_stream)
        return result.value
    except Exception as e:
        logger.exception("通用docx转html方法执行失败:%s", e)
        return None


def table_to_html(docx_file):
    """Render every table of a .docx file as a standalone HTML fragment.

    Every cell is emitted with ``colspan``/``rowspan`` of 1; the real spans
    are worked out afterwards by ``merge_cells``.

    Args:
        docx_file: Path (or file-like object) accepted by ``Document``.

    Returns:
        A list with one ``<table>...</table>`` string per table, in document
        order, or ``None`` if extraction failed (the failure is logged).
    """
    try:
        logger.info("进入正确解析合并的单元格的方法")
        document = Document(docx_file)
        table_list = []
        for table in document.tables:
            # NOTE(review): the exact tag/attribute spelling below was
            # reconstructed from how the fragments are consumed
            # (find("table"), find_all("tr"), find_all(["th", "td"]));
            # confirm it matches the intended markup.
            parts = ["<table>"]
            for row in table.rows:
                parts.append("<tr>")
                for cell in row.cells:
                    # python-docx does not expose how many rows/columns a
                    # merged cell spans, so placeholder spans are written
                    # here and corrected later.
                    colspan = 1
                    rowspan = 1
                    # Escape the text so characters such as "<" or "&" in a
                    # cell cannot break the generated markup.
                    parts.append(
                        f'<td colspan="{colspan}" rowspan="{rowspan}">'
                        f"{escape(cell.text)}</td>"
                    )
                parts.append("</tr>")
            parts.append("</table>")
            table_list.append("".join(parts))
        return table_list
    except Exception as e:
        logger.exception("正确解析合并的单元格的方法执行失败:%s", e)
        return None


def merge_cells(table, max_rows=2):
    """Merge neighbouring cells with identical text in a table's first rows.

    For each cell in the first ``max_rows`` rows, cells immediately to the
    right with the same stripped text are folded into a ``colspan``, and
    cells directly below (same index) with the same text into a ``rowspan``.
    The absorbed cells are removed from the table. The table is modified in
    place.

    NOTE(review): the comparison is purely textual, so adjacent cells that
    merely share the same text (including empty text) are merged as well.

    Args:
        table: BeautifulSoup ``<table>`` tag to process.
        max_rows: Number of leading rows to examine. Defaults to 2;
            ``None`` examines every row.

    Returns:
        The same ``table`` object, or ``None`` if processing failed (the
        failure is logged).
    """
    try:
        logger.info("进入合并单元格的方法")
        rows = table.find_all("tr")[:max_rows]

        # Snapshot the cells and their texts once; nothing is removed until
        # the scan is finished, so the snapshot stays valid throughout.
        row_cells = [row.find_all(_CELL_TAGS) for row in rows]
        row_texts = [
            [cell.get_text(strip=True) for cell in cells]
            for cells in row_cells
        ]

        # (row index, cell index) of every cell absorbed by a neighbour.
        absorbed = set()

        for row_idx, cells in enumerate(row_cells):
            texts = row_texts[row_idx]
            for col_idx, cell in enumerate(cells):
                current_text = texts[col_idx]

                # Extend to the right while the text repeats.
                colspan = 1
                j = col_idx + 1
                while j < len(texts) and texts[j] == current_text:
                    colspan += 1
                    absorbed.add((row_idx, j))
                    j += 1

                # Extend downwards while the same column repeats the text.
                rowspan = 1
                i = row_idx + 1
                while (
                    i < len(rows)
                    and col_idx < len(row_texts[i])
                    and row_texts[i][col_idx] == current_text
                ):
                    rowspan += 1
                    absorbed.add((i, col_idx))
                    i += 1

                if colspan > 1:
                    cell["colspan"] = str(colspan)
                if rowspan > 1:
                    cell["rowspan"] = str(rowspan)

        # Remove from the highest index down so that earlier removals do not
        # shift the positions of cells still waiting to be removed.
        for row_idx, cell_idx in sorted(absorbed, reverse=True):
            try:
                rows[row_idx].find_all(_CELL_TAGS)[cell_idx].decompose()
            except IndexError:
                continue

        return table
    except Exception as e:
        logger.exception("合并单元格的方法执行失败:%s", e)
        return None


if __name__ == "__main__":
    # Use the path given on the command line, falling back to the sample
    # document when none is supplied.
    docx_file = (
        sys.argv[1]
        if len(sys.argv) > 1
        else r"E:\work_data\work\三工单日报\20250311\20250311日报\公司全国“两会”保供电期间配网设备运行及三工单监测日报-20250311.docx"
    )
    docx2html(docx_file)