# report_app/app/tools/docx2html.py

import mammoth
import logging

from docx import Document
from bs4 import BeautifulSoup

from app.tools.beautiful_html import beautiful_report

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


def docx2html(file_path):
    """Convert a .docx file into report HTML whose table cells are merged.

    Purpose of this entry point: hand the front end data that is already HTML.

    The document is converted twice: once as a whole by ``all_to_html`` (per
    the original author's note, merged table cells come out wrong there) and
    once table-by-table by ``table_to_html``.  When both conversions yield the
    same number of tables, every table of the whole-document HTML is replaced
    by its rebuilt counterpart after ``merge_cells`` has processed it.  The
    resulting document is prettified and handed to ``beautiful_report``.

    Args:
        file_path: Path of the .docx file to convert.

    Returns:
        The value returned by ``beautiful_report`` for the assembled HTML, or
        ``None`` when any step fails (failures are logged, never raised).
    """
    try:
        logger.info("进入解析后的html的单元格合并的主方法")

        # Whole-document HTML; its merged cells are not parsed correctly.
        original_html = all_to_html(file_path)

        # One HTML string per table of the Word document.
        table_new_list = table_to_html(file_path)

        if original_html is None or table_new_list is None:
            # The helpers log their own failure and return None; stop here
            # instead of triggering a second, misleading traceback below.
            return None

        original_soup = BeautifulSoup(original_html, "html.parser")
        table_old_list = original_soup.find_all("table")

        if len(table_old_list) == len(table_new_list):
            for old_table, new_table_html in zip(table_old_list, table_new_list):
                new_table = BeautifulSoup(new_table_html, "html.parser").find(
                    "table"
                )
                old_table.replace_with(merge_cells(new_table))
        else:
            # Tables cannot be paired up one-to-one, so the whole-document
            # tables are kept as they are; make that visible in the log.
            logger.warning(
                "表格数量不一致（html: %d，docx: %d），跳过单元格合并",
                len(table_old_list),
                len(table_new_list),
            )

        html = original_soup.prettify()

        return beautiful_report(html)
    except Exception as e:
        logger.exception(f"合并单元格主方法执行失败：{e}")
        return None


def all_to_html(docx_file):
    """Convert an entire .docx file to HTML with mammoth.

    This step does not care whether tables (in particular merged cells) are
    converted correctly; it only produces the overall document HTML.

    Args:
        docx_file: Path of the .docx file; it is opened in binary mode.

    Returns:
        The ``value`` of mammoth's conversion result, or ``None`` when the
        conversion raises (the error is logged, not propagated).
    """
    try:
        logger.info("进入通用docx转html方法，此时单元格未合并")
        # A separate name for the handle keeps the path parameter intact.
        with open(docx_file, "rb") as stream:
            return mammoth.convert_to_html(stream).value
    except Exception as e:
        logger.exception(f"通用docx转html方法执行失败：{e}")
        return None


def table_to_html(docx_file):
    """Render every table of a Word document as a standalone HTML string.

    Each cell reported by python-docx is written as its own ``<td>`` with
    ``colspan='1' rowspan='1'``; real span values are not computed here
    (python-docx does not expose them directly, per the original author's
    note).  NOTE(review): this presumably relies on python-docx repeating a
    merged cell for every grid position it covers, which ``merge_cells``
    later collapses by comparing cell text — confirm against python-docx.

    Cell text is HTML-escaped so that characters such as ``<``, ``>`` and
    ``&`` inside a cell cannot corrupt the generated markup.

    Args:
        docx_file: Document source passed straight to ``docx.Document``.

    Returns:
        A list with one ``"<table>...</table>"`` string per table in the
        document, or ``None`` when parsing fails (the error is logged).
    """
    # Imported locally under its own name because ``html`` is commonly used
    # as a local variable name in this module.
    from html import escape

    try:
        logger.info("进入正确解析合并的单元格的方法")
        document = Document(docx_file)

        # Placeholder span values; the actual merging happens in merge_cells.
        colspan = 1
        rowspan = 1

        table_list = []
        for table in document.tables:
            # Collect fragments and join once instead of repeated `+=`.
            parts = ["<table>"]
            for row in table.rows:
                parts.append("<tr>")
                for cell in row.cells:
                    parts.append(
                        f"<td colspan='{colspan}' rowspan='{rowspan}'>"
                        f"{escape(cell.text)}</td>"
                    )
                parts.append("</tr>")
            parts.append("</table>")
            table_list.append("".join(parts))

        return table_list
    except Exception as e:
        logger.exception(f"正确解析合并的单元格的方法执行失败：{e}")
        return None


def merge_cells(table):
    """Collapse repeated cells in the first two rows of *table* into spans.

    Only the first two ``<tr>`` rows are examined.  Cells are compared by
    their whitespace-stripped text.  For each cell that has not already been
    absorbed into an earlier span:

    * the span is extended to the right over neighbouring cells with the
      same text (``colspan``);
    * it is extended downwards only if the next row repeats that text in
      *every* column of the span (``rowspan``), so a span is always a full
      rectangle.

    The top-left cell of the rectangle receives the ``colspan``/``rowspan``
    attributes (only those greater than 1 are written) and all other cells
    of the rectangle are removed from the tree.  Cells that were absorbed are
    never used as the start of another span, which keeps every row's column
    count consistent.

    Args:
        table: Parsed table element exposing ``find_all`` (as returned by
            ``BeautifulSoup(...).find("table")``).  It is modified in place.

    Returns:
        The same *table* object, or ``None`` when an error occurs (the error
        is logged, not raised).
    """
    try:
        logger.info("进入合并单元格的方法")

        # Only the first two rows are candidates for merging.
        rows = table.find_all("tr")[:2]

        # Snapshot cells and their texts once; nothing is removed from the
        # tree until all spans are known, so the snapshot stays valid.
        grid = [row.find_all(["th", "td"]) for row in rows]
        texts = [[cell.get_text(strip=True) for cell in cells] for cells in grid]

        # (row, col) positions swallowed by a span and due for removal.
        absorbed = set()

        for row_idx, row_texts in enumerate(texts):
            for col_idx, text in enumerate(row_texts):
                if (row_idx, col_idx) in absorbed:
                    continue

                # Grow to the right over equal-text cells that are still free.
                col_end = col_idx + 1
                while (
                    col_end < len(row_texts)
                    and (row_idx, col_end) not in absorbed
                    and row_texts[col_end] == text
                ):
                    col_end += 1
                span_cols = range(col_idx, col_end)

                # Grow downwards only while the whole width matches.
                row_end = row_idx + 1
                while (
                    row_end < len(texts)
                    and col_end <= len(texts[row_end])
                    and all(texts[row_end][c] == text for c in span_cols)
                ):
                    row_end += 1

                colspan = col_end - col_idx
                rowspan = row_end - row_idx
                if colspan == 1 and rowspan == 1:
                    continue

                anchor = grid[row_idx][col_idx]
                if colspan > 1:
                    anchor["colspan"] = str(colspan)
                if rowspan > 1:
                    anchor["rowspan"] = str(rowspan)

                absorbed.update(
                    (r, c)
                    for r in range(row_idx, row_end)
                    for c in span_cols
                    if (r, c) != (row_idx, col_idx)
                )

        # Remove absorbed cells through the saved references, so indices
        # cannot shift while deleting.
        for r, c in sorted(absorbed, reverse=True):
            grid[r][c].decompose()

        return table

    except Exception as e:
        logger.exception(f"合并单元格的方法执行失败：{e}")
        return None


if __name__ == "__main__":
    import sys

    # Developer-local sample document, used when no path is supplied.
    default_docx_file = r"E:\work_data\work\三工单日报\20250311\20250311日报\公司全国“两会”保供电期间配网设备运行及三工单监测日报-20250311.docx"

    # Optional override: `python docx2html.py <path-to-docx>`.
    docx_file = sys.argv[1] if len(sys.argv) > 1 else default_docx_file

    docx2html(docx_file)