You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
gcgj-dify-1.7.0/api/services/ext/read_file_service.py

299 lines
12 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

class ReadPdfService:
@classmethod
def load_content(cls, pdf_file_path: str) -> str | None:
doc = None
try:
# PDF标题提取需要pymupdf库
import fitz
doc = fitz.open(pdf_file_path)
contents = []
for page in doc:
page_height = page.rect.height
page_width = page.rect.width
# 旧版本中 Page 对象可能没有 get_text 方法,使用 getText 方法替代
# 从 v1.21.0 版本开始fitz.Page 类的 getText 方法已被弃用,应使用 get_text 方法
blocks = page.get_text("dict")["blocks"] # type: ignore
page_number = page.number
# if page_number == 39:
# print("-----------------------------")
blocks = cls.handle_blocks(blocks)
for block in blocks:
# if page_number == 39:
# print("-----------------------------")
# print(block)
type = block["type"]
if type == 1:
continue
content = cls.get_block_content(block,page_width,page_height)
if content is not None:
# if "图5-5" in content:
# # print(content)
# print("aaaaaaaaa")
contents.append(content)
return "\n".join(contents)
finally:
if doc is not None:
doc.close()
@classmethod
def handle_blocks(cls, blocks: list[dict]) -> list[dict] | None:
if blocks is not None:
handle_block_list = cls.sort_blocks(blocks)
handle_block_list = cls.filter_inner_img_block(handle_block_list)
return handle_block_list
return blocks
@classmethod
def sort_blocks(cls, blocks: list[dict]) -> list[dict] | None:
if blocks is not None:
def custom_sorted(block:dict) -> float:
bbox = block["bbox"]
return bbox[1]
sorted_data_asc = sorted(blocks, key=custom_sorted)
return sorted_data_asc
return blocks
@classmethod
def handle_lines(cls, blocks: list[dict]) -> list[dict] | None:
if blocks is not None:
def top_sorted(block:dict) -> float:
bbox = block["bbox"]
return bbox[1]
def left_sorted(block:dict) -> float:
bbox = block["bbox"]
return bbox[0]
sorted_data_asc = sorted(blocks, key=top_sorted)
sorted_data_asc = sorted(sorted_data_asc, key=left_sorted)
return sorted_data_asc
return blocks
@classmethod
def is_inner_img_block(cls, block: dict, img_bboxs: list[dict]) -> bool:
is_inner_img = False
type = block["type"]
if type != 1:
for img_bbox in img_bboxs:
if not is_inner_img:
bbox = block["bbox"]
is_inner_ = (bbox[0] >= img_bbox[0]
and bbox[1] >= img_bbox[1]
and bbox[2] <= img_bbox[2]
and bbox[3] <= img_bbox[3])
if is_inner_:
is_inner_img = True
return is_inner_img
@classmethod
def get_only_row_img_bboxs(cls, blocks: list[dict],img_bboxs: list[dict]) -> list[dict]:
# 判断图片是否是单独一行,单独一行的图片,内部的文字正常处理,反之,不处理(如果两个图片并列的话,内部的文字也是正常处理)
only_row_img_bboxs = []
for img_bbox in img_bboxs:
# 同层级是否只有图片
is_only_img = True
for block in blocks:
type = block["type"]
if type != 1:
bbox = block["bbox"]
# 判断当前是否是不在图片内的文本
is_inner_img = cls.is_inner_img_block(block, img_bboxs)
# 判断当前是否在同层级
is_save_level = (
(
bbox[0] > img_bbox[2]
or bbox[2] < img_bbox[0]
)
and bbox[1] >= img_bbox[1]
and bbox[3] <= img_bbox[3]
)
if not is_inner_img and is_save_level:
is_only_img = False
if is_only_img:
only_row_img_bboxs.append(img_bbox)
return only_row_img_bboxs
@classmethod
def filter_inner_img_block(cls, blocks: list[dict]) -> list[dict] | None:
if blocks is not None:
# 判断是否有图片
img_bboxs:list[dict] = []
for block in blocks:
bbox = block["bbox"]
type = block["type"]
if type == 1:
img_bboxs.append(bbox)
if len(img_bboxs) > 0:
# 获取所有单独在一行的图片区域集合
only_row_img_bboxs = cls.get_only_row_img_bboxs(blocks, img_bboxs)
filter_blocks: list[dict] = []
for block in blocks:
type = block["type"]
if type != 1:
# 判断是否在单行都是图片的区域
is_only_row_img = cls.is_inner_img_block(block, only_row_img_bboxs)
# 判断是否在图片区域内
is_inner_img = cls.is_inner_img_block(block, img_bboxs)
# 如果在单行都是图片的区域内,返回值。或者不在图片区域内,返回值。
if is_only_row_img or not is_inner_img:
filter_blocks.append(block)
# else:
# content_ = cls.load_block_content(block)
# print(content_)
return filter_blocks
return blocks
@classmethod
def get_block_content(cls, block: dict, page_width: float, page_height : float) -> str | None:
header = cls.is_header_fitz(block=block, page_width=page_width, page_height=page_height)
footer = cls.is_footer_fitz(block=block, page_width=page_width, page_height=page_height)
if not header and not footer:
return cls.load_block_content(block=block)
return None
@classmethod
def load_block_content(cls, block: dict) -> str | None:
if "lines" in block:
line_texts = []
lines = cls.handle_lines(block["lines"])
for line in lines:
texts = []
for span in line["spans"]:
text = span["text"].strip()
if text:
texts.append(text)
if len(texts) > 0:
line_text = "".join(texts)
line_texts.append(line_text)
if len(line_texts) > 0:
# print("************************************",len(line_texts))
# print("\n".join(line_texts))
return "\n".join(line_texts)
return None
@classmethod
def is_heading_fitz(cls, span: dict, page_width: float) -> bool:
"""
判断一个文本片段是否为标题
:param span: PyMuPDF 返回的文本片段信息
:param page_width: 页面宽度(用于判断位置)
:return: 是否为标题
"""
# 特征 1字体加粗
is_bold = "bold" in span["font"].lower()
# 特征 2字体大小相对较大
is_large_font = span["size"] > 14 # 适当降低阈值
# 特征 3文本位置靠近页面顶部或居中
is_top = span["origin"][1] < 100 # 距离页面顶部小于 100 像素
is_centered = abs(span["origin"][0] - page_width / 2) < 50 # 水平居中
# 特征 4文本格式包含大写字母或编号
text = span["text"].strip()
is_uppercase = text.isupper()
is_numbered = any(text.startswith(f"{i}.") for i in range(1, 10)) # 如 "1.", "2."
# 综合判断
# return is_bold or is_large_font
# flg = (is_bold or is_large_font) and (is_top or is_centered) and (is_uppercase or is_numbered)
flg = is_numbered
if flg :
print("标题:",text)
return flg
@classmethod
def is_top_fitz(cls, bbox_top: float) -> bool:
# print("----------------",bbox_top)
if bbox_top < 58:
return True
return False
@classmethod
def is_bottom_fitz(cls, bbox_bottom: float, page_height) -> bool:
# print("----------------",bbox_top)
if bbox_bottom > page_height - 60:
return True
return False
@classmethod
def is_centered_fitz(cls, origin_x: float, page_width: float) -> bool:
is_centered = abs(origin_x - page_width / 2) < 20 # 水平居中
return is_centered
@classmethod
def get_origin_bybbox(cls,bbox:list[float]) -> list[float]:
x = (bbox[0] + bbox[2]) / 2
y = (bbox[1] + bbox[3]) / 2
origin = list([x,y])
return origin
@classmethod
def get_first_span(cls,block:dict) -> dict | None:
if "lines" in block and len(block["lines"]) > 0:
lines = block["lines"]
line = lines[0]
return line["spans"][0]
return None
@classmethod
def is_bold_byspan(cls,span:dict) -> bool:
is_bold = "bold" in span["font"].lower()
return is_bold
@classmethod
def is_header_fitz(cls,block: dict,page_width: float, page_height: float) -> bool:
# 判断block
if block:
bbox = block["bbox"]
# number = block["number"]
origin = cls.get_origin_bybbox(bbox)
if "lines" in block and len(block["lines"]) == 1:
span = cls.get_first_span(block)
# 是否加粗
is_bold = cls.is_bold_byspan(span)
# 判断字体是否比较小
is_font_size = span["size"] < 10
# 不满行
is_not_full_line = bbox[0] > 120 or (bbox[3] < (page_width -120))
# 靠近页面顶部
is_top = cls.is_top_fitz(bbox_top = bbox[1]) # 距离页面顶部小于 100 像素
# 水平居中
is_centered = cls.is_centered_fitz(origin_x = origin[0], page_width=page_width)
# 在判断字体是否加粗,或者字体大小,一般页眉页脚的字体比较小
return is_top or ( bbox[1] < 70 and not is_bold and is_font_size and is_centered and is_not_full_line )
return False
@classmethod
def is_footer_fitz(cls,block: dict,page_width: float, page_height: float) -> bool:
# 判断block
if block:
bbox = block["bbox"]
# number = block["number"]
origin = cls.get_origin_bybbox(bbox)
if "lines" in block :
span = cls.get_first_span(block)
# 是否加粗
is_bold = cls.is_bold_byspan(span)
# 判断字体是否比较小
is_font_size = span["size"] < 10
# 不满行
is_not_full_line = bbox[0] > 120 or (bbox[3] < (page_width -120))
# 靠近页面顶部
is_bottom = cls.is_bottom_fitz(bbox_bottom = bbox[3],page_height=page_height) # 距离页面顶部小于 100 像素
# 水平居中
is_centered = cls.is_centered_fitz(origin_x = origin[0], page_width=page_width)
# 在判断字体是否加粗,或者字体大小,一般页眉页脚的字体比较小
return is_bottom or ( bbox[3] > page_height - 70 and not is_bold and is_font_size and is_centered and is_not_full_line )
return False
if __name__ == "__main__":
    # Manual smoke run: extract the text of a local sample PDF and print it.
    sample_pdf_path = r"D:\a.pdf"
    print(ReadPdfService.load_content(pdf_file_path=sample_pdf_path))