The implementation of knowledge base PDF parsing using pypdfium2 to extract text mainly has the following issues:

1. Limited text extraction capability and insufficient support for tables and images
2. Lack of specialized Chinese processing optimization
3. No document structure analysis
4. Lack of document quality assessment
Suggested optimization plan:
1. Use pdfplumber instead of pypdfium2
2. Increase OCR support
3. Optimize Chinese processing logic
4. Add document structure analysis
5. Implement intelligent table recognition
6. Add caching mechanism
7. Optimize large file processing
pull/19056/head
魏恒 1 year ago
parent bf46b894e9
commit 236c9d64c3

@ -1,28 +1,42 @@
"""Abstract interface for document loader implementations."""
"""Enhanced PDF document loader with improved features for text extraction and structure analysis."""
import uuid
from collections.abc import Iterator
from typing import Optional, cast
import mimetypes
import pdfplumber
import pytesseract
from PIL import Image
import io
import datetime
from configs import dify_config
from core.rag.extractor.blob.blob import Blob
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.enums import CreatedByRole
from models.model import UploadFile
class PdfExtractor(BaseExtractor):
"""Load pdf files.
"""Enhanced PDF loader with improved text extraction, OCR support, and structure analysis.
Args:
file_path: Path to the file to load.
file_path: Path to the PDF file to load.
file_cache_key: Optional cache key for storing extracted text.
enable_ocr: Whether to enable OCR for text extraction from images.
"""
def __init__(self, file_path: str, file_cache_key: Optional[str] = None):
"""Initialize with file path."""
def __init__(self, file_path: str, file_cache_key: Optional[str] = None, enable_ocr: bool = False, tenant_id: str = None, user_id: str = None):
"""Initialize with file path and optional settings."""
self._file_path = file_path
self._file_cache_key = file_cache_key
self._enable_ocr = enable_ocr
self._tenant_id = tenant_id
self._user_id = user_id
def extract(self) -> list[Document]:
"""Extract text from PDF with caching support."""
plaintext_file_exists = False
if self._file_cache_key:
try:
@ -31,38 +45,118 @@ class PdfExtractor(BaseExtractor):
return [Document(page_content=text)]
except FileNotFoundError:
pass
documents = list(self.load())
text_list = []
for document in documents:
text_list.append(document.page_content)
text = "\n\n".join(text_list)
# save plaintext file for caching
# Save plaintext file for caching
if not plaintext_file_exists and self._file_cache_key:
storage.save(self._file_cache_key, text.encode("utf-8"))
return documents
def load(
self,
) -> Iterator[Document]:
"""Lazy load given path as pages."""
def load(self) -> Iterator[Document]:
"""Lazy load PDF pages with enhanced text extraction."""
blob = Blob.from_path(self._file_path)
yield from self.parse(blob)
def parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import pypdfium2 # type: ignore
"""Parse PDF with enhanced features including OCR and structure analysis."""
with blob.as_bytes_io() as file_obj:
with pdfplumber.open(file_obj) as pdf:
for page_number, page in enumerate(pdf.pages):
# Extract text with layout preservation and encoding detection
content = page.extract_text(layout=True)
# Try to detect and fix encoding issues
try:
# First try to decode as UTF-8
content = content.encode('utf-8').decode('utf-8')
except UnicodeError:
try:
# If UTF-8 fails, try GB18030 (common Chinese encoding)
content = content.encode('utf-8').decode('gb18030', errors='ignore')
except UnicodeError:
# If all else fails, use a more lenient approach
content = content.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
with blob.as_bytes_io() as file_path:
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
try:
for page_number, page in enumerate(pdf_reader):
text_page = page.get_textpage()
content = text_page.get_text_range()
text_page.close()
page.close()
metadata = {"source": blob.source, "page": page_number}
yield Document(page_content=content, metadata=metadata)
finally:
pdf_reader.close()
# Extract tables if present
tables = page.extract_tables()
if tables:
table_text = ""
for table in tables:
# Convert table to text format
table_text +="\n".join(
["\t".join([str(cell) if cell else "" for cell in row])
for row in table]
)
content += table_text
# Extract images if present
images = page.images
if images:
image_text = ""
for i, img in enumerate(images):
# Generate a unique filename for the image
file_uuid = str(uuid.uuid4())
image_ext = "png"
file_key = f"image_files/{self._tenant_id}/{file_uuid}.{image_ext}"
mime_type, _ = mimetypes.guess_type(file_key)
# Save image to storage
image_bytes = io.BytesIO(img['stream'].get_data())
image_data = image_bytes.getvalue()
storage.save(file_key, image_data)
# Save file record to database
if self._tenant_id and self._user_id:
upload_file = UploadFile(
tenant_id=self._tenant_id,
storage_type=dify_config.STORAGE_TYPE,
key=file_key,
name=file_key,
size=len(image_data),
extension=image_ext,
mime_type=mime_type or "",
created_by=self._user_id,
created_by_role=CreatedByRole.ACCOUNT,
created_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None),
used=True,
used_by=self._user_id,
used_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
)
db.session.add(upload_file)
db.session.commit()
# Add image preview URL to content
image_text += f"![image]({dify_config.CONSOLE_API_URL}/files/{upload_file.id}/file-preview) "
content += image_text
# Perform OCR if enabled and text content is limited or contains potential encoding issues
if self._enable_ocr and (len(content.strip()) < 100 or any('\ufffd' in line for line in content.splitlines())):
image = page.to_image()
img_bytes = io.BytesIO()
image.original.save(img_bytes, format='PNG')
img_bytes.seek(0)
pil_image = Image.open(img_bytes)
# Use multiple language models and improve OCR accuracy
ocr_text = pytesseract.image_to_string(
pil_image,
lang='chi_sim+chi_tra+eng', # Support both simplified and traditional Chinese
config='--psm 3 --oem 3' # Use more accurate OCR mode
)
if ocr_text.strip():
# Clean and normalize OCR text
ocr_text = ocr_text.replace('\x0c', '').strip()
content = f"{content}\n\nOCR Text:\n{ocr_text}"
metadata = {
"source": blob.source,
"page": page_number,
"has_tables": bool(tables)
}
yield Document(page_content=content, metadata=metadata)
Loading…
Cancel
Save