|
|
import mammoth
|
|
|
import logging
|
|
|
|
|
|
from docx import Document
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
from app.tools.beautiful_html import beautiful_report
|
|
|
|
|
|
# Module-level logger for this file.
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
# Purpose of this entry point: hand the front end ready-made HTML.
def docx2html(file_path):
    """Convert a .docx file to report HTML with its table cells merged.

    The document is converted twice: ``all_to_html`` yields the full HTML
    (in which merged table cells are rendered incorrectly) and
    ``table_to_html`` yields one HTML string per table. When both views
    contain the same number of tables, every table in the full HTML is
    replaced by its counterpart after that counterpart has been passed
    through ``merge_cells``. When the counts differ, the tables in the full
    HTML are left untouched.

    Args:
        file_path: Path of the .docx file; passed to both converters.

    Returns:
        The result of ``beautiful_report`` applied to the prettified HTML,
        or ``None`` if any step raises (the failure is logged with its
        traceback).
    """
    try:
        logger.info("进入解析后的html的单元格合并的主方法")

        # Full-document HTML; merged cells are not rendered correctly here.
        original_html = all_to_html(file_path)
        # One HTML string per table in the Word document.
        table_new_list = table_to_html(file_path)

        original_soup = BeautifulSoup(original_html, "html.parser")
        table_old_list = original_soup.find_all("table")

        # Tables are paired by position, so only swap when the counts agree.
        if len(table_old_list) == len(table_new_list):
            for stale_table, fresh_markup in zip(table_old_list, table_new_list):
                fresh_table = BeautifulSoup(fresh_markup, "html.parser").find("table")
                stale_table.replace_with(merge_cells(fresh_table))

        return beautiful_report(original_soup.prettify())
    except Exception as exc:
        logger.exception(f"合并单元格主方法执行失败:{exc}")
|
|
|
|
|
|
|
|
|
# Convert the whole docx to HTML; table correctness is not handled here.
def all_to_html(docx_file):
    """Convert an entire .docx file to HTML using mammoth.

    Merged table cells are not corrected at this stage.

    Args:
        docx_file: Path of the .docx file; it is opened in binary mode.

    Returns:
        The ``value`` attribute of the ``mammoth.convert_to_html`` result,
        or ``None`` if opening or converting the file raises (the failure
        is logged with its traceback).
    """
    try:
        logger.info("进入通用docx转html方法,此时单元格未合并")
        with open(docx_file, "rb") as stream:
            conversion = mammoth.convert_to_html(stream)
            return conversion.value
    except Exception as exc:
        logger.exception(f"通用docx转html方法执行失败:{exc}")
|
|
|
|
|
|
|
|
|
# Render every table of the Word document as its own HTML string.
def table_to_html(docx_file):
    """Build one ``<table>`` HTML string per table in a .docx file.

    Each cell yielded by ``row.cells`` becomes a ``<td>`` whose ``colspan``
    and ``rowspan`` are both emitted as ``1``; real span values are not
    computed here. Cell text is HTML-escaped so that ``<``, ``>`` and ``&``
    inside a cell cannot break the generated markup.

    NOTE(review): python-docx appears to yield a merged cell once per grid
    slot it covers, which is presumably what the text-matching in
    ``merge_cells`` relies on -- confirm against the python-docx docs.

    Args:
        docx_file: Path (or whatever ``docx.Document`` accepts) of the
            .docx file to read.

    Returns:
        A list of HTML strings, one per entry of ``document.tables``, or
        ``None`` if reading or rendering raises (the failure is logged with
        its traceback).
    """
    # Imported locally under a distinct name so the file's import block
    # does not need to change.
    from html import escape

    try:
        logger.info("进入正确解析合并的单元格的方法")
        document = Document(docx_file)

        # Placeholder spans: python-docx does not directly expose how many
        # rows/columns a merged cell covers, so every cell is emitted as 1x1.
        colspan = 1
        rowspan = 1

        table_list = []
        for table in document.tables:
            # Collect fragments and join once instead of repeated ``+=``.
            parts = ["<table>"]
            for row in table.rows:
                parts.append("<tr>")
                for cell in row.cells:
                    # quote=False: the text is element content, not an
                    # attribute value, so quotes need no escaping.
                    cell_text = escape(cell.text, quote=False)
                    parts.append(
                        f"<td colspan='{colspan}' rowspan='{rowspan}'>{cell_text}</td>"
                    )
                parts.append("</tr>")
            parts.append("</table>")
            table_list.append("".join(parts))

        return table_list
    except Exception as e:
        logger.exception(f"正确解析合并的单元格的方法执行失败:{e}")
|
|
|
|
|
|
|
|
|
# Merge duplicated neighbouring cells in the first rows of a table.
def merge_cells(table):
    """Collapse equal-text neighbouring cells in the first two rows.

    Only the first two ``<tr>`` elements found under ``table`` are examined.
    For every ``<th>``/``<td>`` in those rows, the cells to its right and
    the cells below it (same column index) are scanned while their stripped
    text equals the cell's own stripped text. Each matching neighbour is
    marked for removal, and the cell receives a ``colspan`` / ``rowspan``
    attribute when the corresponding run is longer than one. Marked cells
    are then removed from the tree.

    Args:
        table: A BeautifulSoup ``<table>`` tag; it is modified in place.

    Returns:
        The same ``table`` object after merging, or ``None`` if anything
        raises (the failure is logged with its traceback).
    """
    cell_tags = ["th", "td"]
    try:
        logger.info("进入合并单元格的方法")

        # Only the first two rows take part in merging.
        head_rows = table.find_all("tr")[:2]

        # Snapshot cells and their stripped text once; nothing is removed
        # from the tree until the scan below has finished.
        grid = [tr.find_all(cell_tags) for tr in head_rows]
        labels = [[tag.get_text(strip=True) for tag in line] for line in grid]

        # (row, column) positions of cells absorbed by a neighbour.
        absorbed = set()

        for r, line in enumerate(grid):
            for c, anchor in enumerate(line):
                label = labels[r][c]

                # Extend to the right while the text repeats.
                width = 1
                for right in range(c + 1, len(line)):
                    if labels[r][right] != label:
                        break
                    width += 1
                    absorbed.add((r, right))

                # Extend downwards while the same column repeats the text.
                height = 1
                for below in range(r + 1, len(grid)):
                    if c >= len(grid[below]) or labels[below][c] != label:
                        break
                    height += 1
                    absorbed.add((below, c))

                if width > 1:
                    anchor["colspan"] = str(width)
                if height > 1:
                    anchor["rowspan"] = str(height)

        # Remove from the highest position downwards so earlier indexes stay
        # valid; cells are looked up afresh for every removal.
        for r, c in sorted(absorbed, reverse=True):
            try:
                head_rows[r].find_all(cell_tags)[c].decompose()
            except IndexError:
                continue

        return table
    except Exception as exc:
        logger.exception(f"合并单元格的方法执行失败:{exc}")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual run against a local sample report; the path is an absolute
    # Windows path, and the returned HTML is discarded.
    docx2html(
        r"E:\work_data\work\三工单日报\20250311\20250311日报\公司全国“两会”保供电期间配网设备运行及三工单监测日报-20250311.docx"
    )
|