# gcgj-dify-1.7.0/api/services/ext/read_file_service.py

class ReadPdfService:

    @classmethod
    def load_content(cls, pdf_file_path: str) -> str | None:
        """
        Extract the body text of a PDF as a single newline-joined string.

        Each page is read through PyMuPDF ("fitz"). The page's blocks are
        ordered and image-filtered by ``handle_blocks``; every remaining text
        block goes through ``get_block_content``, which returns ``None`` for
        blocks classified as header or footer.

        :param pdf_file_path: path of the PDF file to open
        :return: the kept block texts, joined with newlines
        """
        # Imported here rather than at module level, so importing this module
        # does not itself require PyMuPDF.
        import fitz

        doc = fitz.open(pdf_file_path)
        try:
            collected = []
            for page in doc:
                page_height = page.rect.height
                page_width = page.rect.width
                # "dict" extraction returns the page layout; its "blocks" entry
                # is the list consumed by the helpers of this class.
                raw_blocks = page.get_text("dict")["blocks"]  # type: ignore
                for block in cls.handle_blocks(raw_blocks):
                    if block["type"] == 1:
                        # type 1 is treated as an image block: no text to read
                        continue
                    text = cls.get_block_content(block, page_width, page_height)
                    if text is not None:
                        collected.append(text)
            return "\n".join(collected)
        finally:
            # Always release the document, even when extraction fails midway.
            doc.close()

    @classmethod
    def handle_blocks(cls, blocks: list[dict]) -> list[dict] | None:
        if blocks is not None:
            handle_block_list = cls.sort_blocks(blocks)
            handle_block_list = cls.filter_inner_img_block(handle_block_list)
            return handle_block_list
        return blocks

    @classmethod
    def sort_blocks(cls, blocks: list[dict]) -> list[dict] | None:
        if blocks is not None:
            def custom_sorted(block:dict) -> float:
                bbox = block["bbox"]
                return bbox[1]
            sorted_data_asc = sorted(blocks, key=custom_sorted)
            return sorted_data_asc
        return blocks

    @classmethod
    def handle_lines(cls, blocks: list[dict]) -> list[dict] | None:
        if blocks is not None:

            def top_sorted(block:dict) -> float:
                bbox = block["bbox"]
                return bbox[1]

            def left_sorted(block:dict) -> float:
                bbox = block["bbox"]
                return bbox[0]
            sorted_data_asc = sorted(blocks, key=top_sorted)
            sorted_data_asc = sorted(sorted_data_asc, key=left_sorted)

            return sorted_data_asc
        return blocks

    @classmethod
    def is_inner_img_block(cls, block: dict, img_bboxs: list[dict]) -> bool:
        is_inner_img = False
        type = block["type"]
        if type != 1:
            for img_bbox in img_bboxs:
                if not is_inner_img:
                    bbox = block["bbox"]
                    is_inner_ = (bbox[0] >= img_bbox[0]
                                 and bbox[1] >= img_bbox[1]
                                 and bbox[2] <= img_bbox[2]
                                 and bbox[3] <= img_bbox[3])
                    if is_inner_:
                        is_inner_img = True
        return is_inner_img

    @classmethod
    def get_only_row_img_bboxs(cls, blocks: list[dict],img_bboxs: list[dict]) -> list[dict]:
        # 判断图片是否是单独一行，单独一行的图片，内部的文字正常处理，反之，不处理（如果两个图片并列的话，内部的文字也是正常处理）
        only_row_img_bboxs = []
        for img_bbox in img_bboxs:
            # 同层级是否只有图片
            is_only_img = True
            for block in blocks:
                type = block["type"]
                if type != 1:
                    bbox = block["bbox"]
                    # 判断当前是否是不在图片内的文本
                    is_inner_img = cls.is_inner_img_block(block, img_bboxs)
                    # 判断当前是否在同层级
                    is_save_level = (
                        (
                            bbox[0] > img_bbox[2]
                            or bbox[2] < img_bbox[0]
                        )
                        and bbox[1] >= img_bbox[1]
                        and bbox[3] <= img_bbox[3]
                    )
                    if not is_inner_img and is_save_level:
                        is_only_img = False
            if is_only_img:
                only_row_img_bboxs.append(img_bbox)
        return only_row_img_bboxs

    @classmethod
    def filter_inner_img_block(cls, blocks: list[dict]) -> list[dict] | None:
        if blocks is not None:
            # 判断是否有图片
            img_bboxs:list[dict] = []
            for block in blocks:
                bbox = block["bbox"]
                type = block["type"]
                if type == 1:
                    img_bboxs.append(bbox)
            if len(img_bboxs) > 0:
                # 获取所有单独在一行的图片区域集合
                only_row_img_bboxs = cls.get_only_row_img_bboxs(blocks, img_bboxs)
                filter_blocks: list[dict] = []
                for block in blocks:
                    type = block["type"]
                    if type != 1:
                        # 判断是否在单行都是图片的区域
                        is_only_row_img = cls.is_inner_img_block(block, only_row_img_bboxs)
                        # 判断是否在图片区域内
                        is_inner_img = cls.is_inner_img_block(block, img_bboxs)
                        # 如果在单行都是图片的区域内，返回值。或者不在图片区域内，返回值。
                        if is_only_row_img or not is_inner_img:
                            filter_blocks.append(block)
                        # else:
                        #     content_ = cls.load_block_content(block)
                        #     print(content_)
                return filter_blocks
        return blocks

    @classmethod
    def get_block_content(cls, block: dict, page_width: float, page_height : float) -> str | None:

        header = cls.is_header_fitz(block=block, page_width=page_width, page_height=page_height)
        footer = cls.is_footer_fitz(block=block, page_width=page_width, page_height=page_height)

        if not header and not footer:
            return cls.load_block_content(block=block)
        return None

    @classmethod
    def load_block_content(cls, block: dict) -> str | None:
        if "lines" in block:
            line_texts = []
            lines = cls.handle_lines(block["lines"])
            for line in lines:
                texts = []
                for span in line["spans"]:
                    text = span["text"].strip()
                    if text:
                        texts.append(text)
                if len(texts) > 0:
                    line_text = "".join(texts)
                    line_texts.append(line_text)
            if len(line_texts) > 0:
                # print("************************************",len(line_texts))
                # print("\n".join(line_texts))
                return "\n".join(line_texts)
        return None

    @classmethod
    def is_heading_fitz(cls, span: dict, page_width: float) -> bool:
        """
        判断一个文本片段是否为标题
        :param span: PyMuPDF 返回的文本片段信息
        :param page_width: 页面宽度（用于判断位置）
        :return: 是否为标题
        """
        # 特征 1：字体加粗
        is_bold = "bold" in span["font"].lower()

        # 特征 2：字体大小（相对较大）
        is_large_font = span["size"] > 14  # 适当降低阈值

        # 特征 3：文本位置（靠近页面顶部或居中）
        is_top = span["origin"][1] < 100  # 距离页面顶部小于 100 像素
        is_centered = abs(span["origin"][0] - page_width / 2) < 50  # 水平居中

        # 特征 4：文本格式（包含大写字母或编号）
        text = span["text"].strip()
        is_uppercase = text.isupper()
        is_numbered = any(text.startswith(f"{i}.") for i in range(1, 10))  # 如 "1.", "2."

        # 综合判断
        # return is_bold or is_large_font
        # flg = (is_bold or is_large_font) and (is_top or is_centered) and (is_uppercase or is_numbered)
        flg = is_numbered
        if flg :
            print("标题：",text)
        return flg

    @classmethod
    def is_top_fitz(cls, bbox_top: float) -> bool:
        # print("----------------",bbox_top)
        if bbox_top < 58:
            return True
        return False

    @classmethod
    def is_bottom_fitz(cls, bbox_bottom: float, page_height) -> bool:
        # print("----------------",bbox_top)
        if bbox_bottom > page_height - 60:
            return True
        return False

    @classmethod
    def is_centered_fitz(cls, origin_x: float, page_width: float) -> bool:
        is_centered = abs(origin_x - page_width / 2) < 20  # 水平居中
        return is_centered

    @classmethod
    def get_origin_bybbox(cls,bbox:list[float]) -> list[float]:
        x = (bbox[0] + bbox[2]) / 2
        y = (bbox[1] + bbox[3]) / 2
        origin = list([x,y])
        return origin

    @classmethod
    def get_first_span(cls,block:dict) -> dict | None:
        if "lines" in block and len(block["lines"]) > 0:
            lines = block["lines"]
            line = lines[0]
            return line["spans"][0]
        return None

    @classmethod
    def is_bold_byspan(cls,span:dict) -> bool:
        is_bold = "bold" in span["font"].lower()
        return is_bold

    @classmethod
    def is_header_fitz(cls,block: dict,page_width: float, page_height: float) -> bool:
        # 判断block
        if block:
            bbox = block["bbox"]
            # number = block["number"]
            origin = cls.get_origin_bybbox(bbox)
            if "lines" in block and len(block["lines"]) == 1:
                span = cls.get_first_span(block)
                # 是否加粗
                is_bold = cls.is_bold_byspan(span)
                # 判断字体是否比较小
                is_font_size = span["size"] < 10
                # 不满行
                is_not_full_line = bbox[0] > 120 or (bbox[3] < (page_width  -120))
                # 靠近页面顶部
                is_top = cls.is_top_fitz(bbox_top = bbox[1])  # 距离页面顶部小于 100 像素
                # 水平居中
                is_centered = cls.is_centered_fitz(origin_x = origin[0], page_width=page_width)
                # 在判断字体是否加粗，或者字体大小，一般页眉页脚的字体比较小
                return is_top or ( bbox[1] < 70 and not is_bold and is_font_size and is_centered and is_not_full_line  )
        return False

    @classmethod
    def is_footer_fitz(cls,block: dict,page_width: float, page_height: float) -> bool:
        # 判断block
        if block:
            bbox = block["bbox"]
            # number = block["number"]
            origin = cls.get_origin_bybbox(bbox)
            if "lines" in block :
                span = cls.get_first_span(block)
                # 是否加粗
                is_bold = cls.is_bold_byspan(span)
                # 判断字体是否比较小
                is_font_size = span["size"] < 10
                # 不满行
                is_not_full_line = bbox[0] > 120 or (bbox[3] < (page_width  -120))
                # 靠近页面顶部
                is_bottom = cls.is_bottom_fitz(bbox_bottom = bbox[3],page_height=page_height)  # 距离页面顶部小于 100 像素
                # 水平居中
                is_centered = cls.is_centered_fitz(origin_x = origin[0], page_width=page_width)
                # 在判断字体是否加粗，或者字体大小，一般页眉页脚的字体比较小
                return is_bottom or ( bbox[3] > page_height - 70 and not is_bold and is_font_size and is_centered and is_not_full_line  )
        return False

if __name__ == "__main__":
    # Manual smoke test: extract a local sample PDF and print the result.
    # load_content is a classmethod, so no instance is required.
    print(ReadPdfService.load_content(pdf_file_path=r"D:\a.pdf"))