# PDF text extraction utilities built on PyMuPDF (imported as ``fitz``).
class ReadPdfService:
|
|
|
|
|
|
@classmethod
|
|
|
def load_content(cls, pdf_file_path: str) -> str | None:
|
|
|
doc = None
|
|
|
try:
|
|
|
# PDF标题提取(需要pymupdf库)
|
|
|
import fitz
|
|
|
doc = fitz.open(pdf_file_path)
|
|
|
contents = []
|
|
|
for page in doc:
|
|
|
page_height = page.rect.height
|
|
|
page_width = page.rect.width
|
|
|
# 旧版本中 Page 对象可能没有 get_text 方法,使用 getText 方法替代
|
|
|
# 从 v1.21.0 版本开始,fitz.Page 类的 getText 方法已被弃用,应使用 get_text 方法
|
|
|
blocks = page.get_text("dict")["blocks"] # type: ignore
|
|
|
page_number = page.number
|
|
|
# if page_number == 39:
|
|
|
# print("-----------------------------")
|
|
|
blocks = cls.handle_blocks(blocks)
|
|
|
for block in blocks:
|
|
|
# if page_number == 39:
|
|
|
# print("-----------------------------")
|
|
|
# print(block)
|
|
|
type = block["type"]
|
|
|
if type == 1:
|
|
|
continue
|
|
|
content = cls.get_block_content(block,page_width,page_height)
|
|
|
if content is not None:
|
|
|
# if "图5-5" in content:
|
|
|
# # print(content)
|
|
|
# print("aaaaaaaaa")
|
|
|
contents.append(content)
|
|
|
return "\n".join(contents)
|
|
|
finally:
|
|
|
if doc is not None:
|
|
|
doc.close()
|
|
|
|
|
|
@classmethod
|
|
|
def handle_blocks(cls, blocks: list[dict]) -> list[dict] | None:
|
|
|
if blocks is not None:
|
|
|
handle_block_list = cls.sort_blocks(blocks)
|
|
|
handle_block_list = cls.filter_inner_img_block(handle_block_list)
|
|
|
return handle_block_list
|
|
|
return blocks
|
|
|
|
|
|
@classmethod
|
|
|
def sort_blocks(cls, blocks: list[dict]) -> list[dict] | None:
|
|
|
if blocks is not None:
|
|
|
def custom_sorted(block:dict) -> float:
|
|
|
bbox = block["bbox"]
|
|
|
return bbox[1]
|
|
|
sorted_data_asc = sorted(blocks, key=custom_sorted)
|
|
|
return sorted_data_asc
|
|
|
return blocks
|
|
|
|
|
|
@classmethod
|
|
|
def handle_lines(cls, blocks: list[dict]) -> list[dict] | None:
|
|
|
if blocks is not None:
|
|
|
|
|
|
def top_sorted(block:dict) -> float:
|
|
|
bbox = block["bbox"]
|
|
|
return bbox[1]
|
|
|
|
|
|
def left_sorted(block:dict) -> float:
|
|
|
bbox = block["bbox"]
|
|
|
return bbox[0]
|
|
|
sorted_data_asc = sorted(blocks, key=top_sorted)
|
|
|
sorted_data_asc = sorted(sorted_data_asc, key=left_sorted)
|
|
|
|
|
|
return sorted_data_asc
|
|
|
return blocks
|
|
|
|
|
|
@classmethod
|
|
|
def is_inner_img_block(cls, block: dict, img_bboxs: list[dict]) -> bool:
|
|
|
is_inner_img = False
|
|
|
type = block["type"]
|
|
|
if type != 1:
|
|
|
for img_bbox in img_bboxs:
|
|
|
if not is_inner_img:
|
|
|
bbox = block["bbox"]
|
|
|
is_inner_ = (bbox[0] >= img_bbox[0]
|
|
|
and bbox[1] >= img_bbox[1]
|
|
|
and bbox[2] <= img_bbox[2]
|
|
|
and bbox[3] <= img_bbox[3])
|
|
|
if is_inner_:
|
|
|
is_inner_img = True
|
|
|
return is_inner_img
|
|
|
|
|
|
@classmethod
|
|
|
def get_only_row_img_bboxs(cls, blocks: list[dict],img_bboxs: list[dict]) -> list[dict]:
|
|
|
# 判断图片是否是单独一行,单独一行的图片,内部的文字正常处理,反之,不处理(如果两个图片并列的话,内部的文字也是正常处理)
|
|
|
only_row_img_bboxs = []
|
|
|
for img_bbox in img_bboxs:
|
|
|
# 同层级是否只有图片
|
|
|
is_only_img = True
|
|
|
for block in blocks:
|
|
|
type = block["type"]
|
|
|
if type != 1:
|
|
|
bbox = block["bbox"]
|
|
|
# 判断当前是否是不在图片内的文本
|
|
|
is_inner_img = cls.is_inner_img_block(block, img_bboxs)
|
|
|
# 判断当前是否在同层级
|
|
|
is_save_level = (
|
|
|
(
|
|
|
bbox[0] > img_bbox[2]
|
|
|
or bbox[2] < img_bbox[0]
|
|
|
)
|
|
|
and bbox[1] >= img_bbox[1]
|
|
|
and bbox[3] <= img_bbox[3]
|
|
|
)
|
|
|
if not is_inner_img and is_save_level:
|
|
|
is_only_img = False
|
|
|
if is_only_img:
|
|
|
only_row_img_bboxs.append(img_bbox)
|
|
|
return only_row_img_bboxs
|
|
|
|
|
|
@classmethod
|
|
|
def filter_inner_img_block(cls, blocks: list[dict]) -> list[dict] | None:
|
|
|
if blocks is not None:
|
|
|
# 判断是否有图片
|
|
|
img_bboxs:list[dict] = []
|
|
|
for block in blocks:
|
|
|
bbox = block["bbox"]
|
|
|
type = block["type"]
|
|
|
if type == 1:
|
|
|
img_bboxs.append(bbox)
|
|
|
if len(img_bboxs) > 0:
|
|
|
# 获取所有单独在一行的图片区域集合
|
|
|
only_row_img_bboxs = cls.get_only_row_img_bboxs(blocks, img_bboxs)
|
|
|
filter_blocks: list[dict] = []
|
|
|
for block in blocks:
|
|
|
type = block["type"]
|
|
|
if type != 1:
|
|
|
# 判断是否在单行都是图片的区域
|
|
|
is_only_row_img = cls.is_inner_img_block(block, only_row_img_bboxs)
|
|
|
# 判断是否在图片区域内
|
|
|
is_inner_img = cls.is_inner_img_block(block, img_bboxs)
|
|
|
# 如果在单行都是图片的区域内,返回值。或者不在图片区域内,返回值。
|
|
|
if is_only_row_img or not is_inner_img:
|
|
|
filter_blocks.append(block)
|
|
|
# else:
|
|
|
# content_ = cls.load_block_content(block)
|
|
|
# print(content_)
|
|
|
return filter_blocks
|
|
|
return blocks
|
|
|
|
|
|
@classmethod
|
|
|
def get_block_content(cls, block: dict, page_width: float, page_height : float) -> str | None:
|
|
|
|
|
|
header = cls.is_header_fitz(block=block, page_width=page_width, page_height=page_height)
|
|
|
footer = cls.is_footer_fitz(block=block, page_width=page_width, page_height=page_height)
|
|
|
|
|
|
if not header and not footer:
|
|
|
return cls.load_block_content(block=block)
|
|
|
return None
|
|
|
|
|
|
@classmethod
|
|
|
def load_block_content(cls, block: dict) -> str | None:
|
|
|
if "lines" in block:
|
|
|
line_texts = []
|
|
|
lines = cls.handle_lines(block["lines"])
|
|
|
for line in lines:
|
|
|
texts = []
|
|
|
for span in line["spans"]:
|
|
|
text = span["text"].strip()
|
|
|
if text:
|
|
|
texts.append(text)
|
|
|
if len(texts) > 0:
|
|
|
line_text = "".join(texts)
|
|
|
line_texts.append(line_text)
|
|
|
if len(line_texts) > 0:
|
|
|
# print("************************************",len(line_texts))
|
|
|
# print("\n".join(line_texts))
|
|
|
return "\n".join(line_texts)
|
|
|
return None
|
|
|
|
|
|
@classmethod
|
|
|
def is_heading_fitz(cls, span: dict, page_width: float) -> bool:
|
|
|
"""
|
|
|
判断一个文本片段是否为标题
|
|
|
:param span: PyMuPDF 返回的文本片段信息
|
|
|
:param page_width: 页面宽度(用于判断位置)
|
|
|
:return: 是否为标题
|
|
|
"""
|
|
|
# 特征 1:字体加粗
|
|
|
is_bold = "bold" in span["font"].lower()
|
|
|
|
|
|
# 特征 2:字体大小(相对较大)
|
|
|
is_large_font = span["size"] > 14 # 适当降低阈值
|
|
|
|
|
|
# 特征 3:文本位置(靠近页面顶部或居中)
|
|
|
is_top = span["origin"][1] < 100 # 距离页面顶部小于 100 像素
|
|
|
is_centered = abs(span["origin"][0] - page_width / 2) < 50 # 水平居中
|
|
|
|
|
|
# 特征 4:文本格式(包含大写字母或编号)
|
|
|
text = span["text"].strip()
|
|
|
is_uppercase = text.isupper()
|
|
|
is_numbered = any(text.startswith(f"{i}.") for i in range(1, 10)) # 如 "1.", "2."
|
|
|
|
|
|
# 综合判断
|
|
|
# return is_bold or is_large_font
|
|
|
# flg = (is_bold or is_large_font) and (is_top or is_centered) and (is_uppercase or is_numbered)
|
|
|
flg = is_numbered
|
|
|
if flg :
|
|
|
print("标题:",text)
|
|
|
return flg
|
|
|
|
|
|
@classmethod
|
|
|
def is_top_fitz(cls, bbox_top: float) -> bool:
|
|
|
# print("----------------",bbox_top)
|
|
|
if bbox_top < 58:
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
@classmethod
|
|
|
def is_bottom_fitz(cls, bbox_bottom: float, page_height) -> bool:
|
|
|
# print("----------------",bbox_top)
|
|
|
if bbox_bottom > page_height - 60:
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
@classmethod
|
|
|
def is_centered_fitz(cls, origin_x: float, page_width: float) -> bool:
|
|
|
is_centered = abs(origin_x - page_width / 2) < 20 # 水平居中
|
|
|
return is_centered
|
|
|
|
|
|
@classmethod
|
|
|
def get_origin_bybbox(cls,bbox:list[float]) -> list[float]:
|
|
|
x = (bbox[0] + bbox[2]) / 2
|
|
|
y = (bbox[1] + bbox[3]) / 2
|
|
|
origin = list([x,y])
|
|
|
return origin
|
|
|
|
|
|
@classmethod
|
|
|
def get_first_span(cls,block:dict) -> dict | None:
|
|
|
if "lines" in block and len(block["lines"]) > 0:
|
|
|
lines = block["lines"]
|
|
|
line = lines[0]
|
|
|
return line["spans"][0]
|
|
|
return None
|
|
|
|
|
|
@classmethod
|
|
|
def is_bold_byspan(cls,span:dict) -> bool:
|
|
|
is_bold = "bold" in span["font"].lower()
|
|
|
return is_bold
|
|
|
|
|
|
@classmethod
|
|
|
def is_header_fitz(cls,block: dict,page_width: float, page_height: float) -> bool:
|
|
|
# 判断block
|
|
|
if block:
|
|
|
bbox = block["bbox"]
|
|
|
# number = block["number"]
|
|
|
origin = cls.get_origin_bybbox(bbox)
|
|
|
if "lines" in block and len(block["lines"]) == 1:
|
|
|
span = cls.get_first_span(block)
|
|
|
# 是否加粗
|
|
|
is_bold = cls.is_bold_byspan(span)
|
|
|
# 判断字体是否比较小
|
|
|
is_font_size = span["size"] < 10
|
|
|
# 不满行
|
|
|
is_not_full_line = bbox[0] > 120 or (bbox[3] < (page_width -120))
|
|
|
# 靠近页面顶部
|
|
|
is_top = cls.is_top_fitz(bbox_top = bbox[1]) # 距离页面顶部小于 100 像素
|
|
|
# 水平居中
|
|
|
is_centered = cls.is_centered_fitz(origin_x = origin[0], page_width=page_width)
|
|
|
# 在判断字体是否加粗,或者字体大小,一般页眉页脚的字体比较小
|
|
|
return is_top or ( bbox[1] < 70 and not is_bold and is_font_size and is_centered and is_not_full_line )
|
|
|
return False
|
|
|
|
|
|
@classmethod
|
|
|
def is_footer_fitz(cls,block: dict,page_width: float, page_height: float) -> bool:
|
|
|
# 判断block
|
|
|
if block:
|
|
|
bbox = block["bbox"]
|
|
|
# number = block["number"]
|
|
|
origin = cls.get_origin_bybbox(bbox)
|
|
|
if "lines" in block :
|
|
|
span = cls.get_first_span(block)
|
|
|
# 是否加粗
|
|
|
is_bold = cls.is_bold_byspan(span)
|
|
|
# 判断字体是否比较小
|
|
|
is_font_size = span["size"] < 10
|
|
|
# 不满行
|
|
|
is_not_full_line = bbox[0] > 120 or (bbox[3] < (page_width -120))
|
|
|
# 靠近页面顶部
|
|
|
is_bottom = cls.is_bottom_fitz(bbox_bottom = bbox[3],page_height=page_height) # 距离页面顶部小于 100 像素
|
|
|
# 水平居中
|
|
|
is_centered = cls.is_centered_fitz(origin_x = origin[0], page_width=page_width)
|
|
|
# 在判断字体是否加粗,或者字体大小,一般页眉页脚的字体比较小
|
|
|
return is_bottom or ( bbox[3] > page_height - 70 and not is_bold and is_font_size and is_centered and is_not_full_line )
|
|
|
return False
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: extract the text of a local sample PDF and print it.
    # ``load_content`` is a classmethod, so it is called on the class itself.
    extracted_text = ReadPdfService.load_content(pdf_file_path=r"D:\a.pdf")
    print(extracted_text)