You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
report_app/app/tools/docx2html.py

151 lines
5.1 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import mammoth
import logging
from docx import Document
from bs4 import BeautifulSoup
from app.tools.beautiful_html import beautiful_report
# Module-level logger.
logger = logging.getLogger(__name__)
# Entry point: its purpose is to hand the front end data that is already HTML.
def docx2html(file_path):
    """Convert a .docx report to HTML, swapping in tables with merged cells.

    The document is converted twice: ``all_to_html`` yields the full HTML and
    ``table_to_html`` yields one standalone ``<table>`` string per table.  When
    both conversions found the same number of tables, every table in the full
    HTML is replaced by its ``merge_cells``-processed counterpart.  When the
    counts differ, the tables of the full HTML are left as they are.

    Args:
        file_path: Path of the .docx file to convert.

    Returns:
        Whatever ``beautiful_report`` returns for the prettified HTML, or
        ``None`` if any step raised (the exception is logged, not re-raised).
    """
    try:
        logger.info("进入解析后的html的单元格合并的主方法")
        # Full-document HTML; per the original author its merged table cells
        # come out wrong, which is why the tables are rebuilt separately.
        full_html = all_to_html(file_path)
        # One rebuilt HTML string per table in the document.
        rebuilt_tables = table_to_html(file_path)
        page = BeautifulSoup(full_html, "html.parser")
        stale_tables = page.find_all("table")
        if len(stale_tables) == len(rebuilt_tables):
            # Same count on both sides, so tables are paired by position.
            for stale, rebuilt in zip(stale_tables, rebuilt_tables):
                fresh = BeautifulSoup(rebuilt, "html.parser").find("table")
                stale.replace_with(merge_cells(fresh))
        return beautiful_report(page.prettify())
    except Exception as e:
        logger.exception(f"合并单元格主方法执行失败:{e}")
# Convert the whole docx to HTML; this step does not care whether tables are converted correctly.
def all_to_html(docx_file):
    """Convert an entire .docx file to HTML with mammoth.

    Args:
        docx_file: Path of the .docx file; it is opened in binary mode.

    Returns:
        The ``value`` of mammoth's conversion result, or ``None`` if opening
        or converting the file raised (the exception is logged, not re-raised).
    """
    try:
        logger.info("进入通用docx转html方法此时单元格未合并")
        with open(docx_file, "rb") as source:
            converted = mammoth.convert_to_html(source)
            return converted.value
    except Exception as e:
        logger.exception(f"通用docx转html方法执行失败{e}")
# Rebuild the Word tables as HTML so that tables containing merged cells can be rendered correctly.
def table_to_html(docx_file):
    """Rebuild every table of a .docx file as a standalone HTML string.

    Each table becomes ``<table>`` markup with one ``<tr>`` per row and one
    ``<td>`` per entry of ``row.cells``.  Every cell is emitted with
    ``colspan='1'`` and ``rowspan='1'``: real spans are not computed here
    (the original author notes that python-docx does not expose them
    directly).  ``docx2html`` passes each string through ``merge_cells``
    afterwards.

    Cell text is HTML-escaped so that ``<``, ``>`` and ``&`` in the document
    cannot corrupt the generated markup when it is parsed again.

    Args:
        docx_file: Path of the .docx file, passed straight to ``Document``.

    Returns:
        A list of HTML strings, one per entry of ``document.tables``, or
        ``None`` if reading the document raised (the exception is logged,
        not re-raised).
    """
    # Function-scope import of just ``escape``: other functions in this module
    # use ``html`` as a local variable name, so the module name is kept out of
    # the module namespace.
    from html import escape

    try:
        logger.info("进入正确解析合并的单元格的方法")
        document = Document(docx_file)
        # Placeholder spans; see the docstring.
        colspan = 1
        rowspan = 1
        table_list = []
        for table in document.tables:
            parts = ["<table>"]
            for row in table.rows:
                parts.append("<tr>")
                for cell in row.cells:
                    # quote=False: the text is element content, not an
                    # attribute value, so quotes need no escaping.
                    text = escape(cell.text, quote=False)
                    parts.append(
                        f"<td colspan='{colspan}' rowspan='{rowspan}'>{text}</td>"
                    )
                parts.append("</tr>")
            parts.append("</table>")
            table_list.append("".join(parts))
        return table_list
    except Exception as e:
        logger.exception(f"正确解析合并的单元格的方法执行失败:{e}")
# Cell-merging helper.
def merge_cells(table):
    """Collapse runs of identically-labelled cells in a table's first two rows.

    Only the first two ``<tr>`` elements found in ``table`` are examined.  For
    every ``<th>``/``<td>`` in them, the cells directly to the right and the
    cells at the same index in the rows below are compared by their stripped
    text.  Each contiguous run of equal text is folded into the first cell:
    that cell receives ``colspan`` / ``rowspan`` attributes and the other cells
    of the run are removed from the tree.

    NOTE(review): equality is purely textual, so neighbouring cells that merely
    share the same text (including empty text) are merged as well.

    Args:
        table: Parsed ``<table>`` element; it is modified in place.

    Returns:
        The same ``table`` object, or ``None`` if anything raised (the
        exception is logged, not re-raised; attributes set before the failure
        remain on the table).
    """
    cell_tags = ["th", "td"]
    try:
        logger.info("进入合并单元格的方法")
        header_rows = table.find_all("tr")[:2]
        # Snapshot of each row's cells; nothing is removed until the scan ends,
        # so these lists stay valid for the whole scan.
        grid = [tr.find_all(cell_tags) for tr in header_rows]
        # (row index, cell index) of every cell absorbed into a neighbour.
        doomed = set()

        for r, row_cells in enumerate(grid):
            for c, cell in enumerate(row_cells):
                label = cell.get_text(strip=True)

                # Horizontal run: neighbours to the right with the same text.
                width = 1
                for neighbour in row_cells[c + 1:]:
                    if neighbour.get_text(strip=True) != label:
                        break
                    doomed.add((r, c + width))
                    width += 1

                # Vertical run: same cell index in the following rows.
                height = 1
                for below in range(r + 1, len(grid)):
                    lower_cells = grid[below]
                    if c >= len(lower_cells):
                        break
                    if lower_cells[c].get_text(strip=True) != label:
                        break
                    doomed.add((below, c))
                    height += 1

                if width > 1:
                    cell["colspan"] = str(width)
                if height > 1:
                    cell["rowspan"] = str(height)

        # Remove from the highest index downwards so that earlier removals do
        # not shift the positions of cells still waiting to be removed.
        for r, c in sorted(doomed, reverse=True):
            try:
                header_rows[r].find_all(cell_tags)[c].decompose()
            except IndexError:
                continue
        return table
    except Exception as e:
        logger.exception(f"合并单元格的方法执行失败:{e}")
if __name__ == "__main__":
    import sys

    # Manual smoke run: convert the .docx given as the first command-line
    # argument; with no argument, fall back to the original sample report path
    # so existing invocations behave as before.
    default_docx = r"E:\work_data\work\三工单日报\20250311\20250311日报\公司全国“两会”保供电期间配网设备运行及三工单监测日报-20250311.docx"
    docx_file = sys.argv[1] if len(sys.argv) > 1 else default_docx
    docx2html(docx_file)