From 236c9d64c363abd1da0f5068ac82697ca1419b0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AD=8F=E6=81=92?= Date: Tue, 29 Apr 2025 11:20:38 +0800 Subject: [PATCH 1/4] The implementation of knowledge base PDF parsing using pypdfium2 to extract text mainly has the following issues: 1. Limited text extraction capability and insufficient support for tables and images 2. Lack of specialized Chinese processing optimization 3. No document structure analysis 4. Lack of document quality assessment Suggested optimization plan: 1. Use pdfplumber instead of pypdfium2 2. Increase OCR support 3. Optimize Chinese processing logic 4. Add document structure analysis 5. Implement intelligent table recognition 6. Add caching mechanism 7. Optimize large file processing --- api/core/rag/extractor/pdf_extractor.py | 148 +++++++++++++++++++----- 1 file changed, 121 insertions(+), 27 deletions(-) diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py index 04033dec3f..9c71216837 100644 --- a/api/core/rag/extractor/pdf_extractor.py +++ b/api/core/rag/extractor/pdf_extractor.py @@ -1,28 +1,42 @@ -"""Abstract interface for document loader implementations.""" - +"""Enhanced PDF document loader with improved features for text extraction and structure analysis.""" +import uuid from collections.abc import Iterator from typing import Optional, cast +import mimetypes +import pdfplumber +import pytesseract +from PIL import Image +import io +import datetime +from configs import dify_config from core.rag.extractor.blob.blob import Blob from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document +from extensions.ext_database import db from extensions.ext_storage import storage - +from models.enums import CreatedByRole +from models.model import UploadFile class PdfExtractor(BaseExtractor): - """Load pdf files. - + """Enhanced PDF loader with improved text extraction, OCR support, and structure analysis. 
Args: - file_path: Path to the file to load. + file_path: Path to the PDF file to load. + file_cache_key: Optional cache key for storing extracted text. + enable_ocr: Whether to enable OCR for text extraction from images. """ - def __init__(self, file_path: str, file_cache_key: Optional[str] = None): - """Initialize with file path.""" + def __init__(self, file_path: str, file_cache_key: Optional[str] = None, enable_ocr: bool = False, tenant_id: str = None, user_id: str = None): + """Initialize with file path and optional settings.""" self._file_path = file_path self._file_cache_key = file_cache_key + self._enable_ocr = enable_ocr + self._tenant_id = tenant_id + self._user_id = user_id def extract(self) -> list[Document]: + """Extract text from PDF with caching support.""" plaintext_file_exists = False if self._file_cache_key: try: @@ -31,38 +45,118 @@ class PdfExtractor(BaseExtractor): return [Document(page_content=text)] except FileNotFoundError: pass + documents = list(self.load()) text_list = [] for document in documents: text_list.append(document.page_content) text = "\n\n".join(text_list) - # save plaintext file for caching + # Save plaintext file for caching if not plaintext_file_exists and self._file_cache_key: storage.save(self._file_cache_key, text.encode("utf-8")) return documents - def load( - self, - ) -> Iterator[Document]: - """Lazy load given path as pages.""" + def load(self) -> Iterator[Document]: + """Lazy load PDF pages with enhanced text extraction.""" blob = Blob.from_path(self._file_path) yield from self.parse(blob) def parse(self, blob: Blob) -> Iterator[Document]: - """Lazily parse the blob.""" - import pypdfium2 # type: ignore + """Parse PDF with enhanced features including OCR and structure analysis.""" + with blob.as_bytes_io() as file_obj: + with pdfplumber.open(file_obj) as pdf: + for page_number, page in enumerate(pdf.pages): + # Extract text with layout preservation and encoding detection + content = page.extract_text(layout=True) + # 
Try to detect and fix encoding issues + try: + # First try to decode as UTF-8 + content = content.encode('utf-8').decode('utf-8') + except UnicodeError: + try: + # If UTF-8 fails, try GB18030 (common Chinese encoding) + content = content.encode('utf-8').decode('gb18030', errors='ignore') + except UnicodeError: + # If all else fails, use a more lenient approach + content = content.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore') - with blob.as_bytes_io() as file_path: - pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True) - try: - for page_number, page in enumerate(pdf_reader): - text_page = page.get_textpage() - content = text_page.get_text_range() - text_page.close() - page.close() - metadata = {"source": blob.source, "page": page_number} - yield Document(page_content=content, metadata=metadata) - finally: - pdf_reader.close() + # Extract tables if present + tables = page.extract_tables() + if tables: + table_text = "" + for table in tables: + # Convert table to text format + table_text +="\n".join( + ["\t".join([str(cell) if cell else "" for cell in row]) + for row in table] + ) + content += table_text + + # Extract images if present + images = page.images + if images: + image_text = "" + for i, img in enumerate(images): + # Generate a unique filename for the image + file_uuid = str(uuid.uuid4()) + image_ext = "png" + file_key = f"image_files/{self._tenant_id}/{file_uuid}.{image_ext}" + mime_type, _ = mimetypes.guess_type(file_key) + + # Save image to storage + image_bytes = io.BytesIO(img['stream'].get_data()) + image_data = image_bytes.getvalue() + storage.save(file_key, image_data) + + # Save file record to database + if self._tenant_id and self._user_id: + upload_file = UploadFile( + tenant_id=self._tenant_id, + storage_type=dify_config.STORAGE_TYPE, + key=file_key, + name=file_key, + size=len(image_data), + extension=image_ext, + mime_type=mime_type or "", + created_by=self._user_id, + created_by_role=CreatedByRole.ACCOUNT, + 
created_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None), + used=True, + used_by=self._user_id, + used_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None) + ) + + db.session.add(upload_file) + db.session.commit() + + # Add image preview URL to content + image_text += f"![image]({dify_config.CONSOLE_API_URL}/files/{upload_file.id}/file-preview) " + content += image_text + + # Perform OCR if enabled and text content is limited or contains potential encoding issues + if self._enable_ocr and (len(content.strip()) < 100 or any('\ufffd' in line for line in content.splitlines())): + image = page.to_image() + img_bytes = io.BytesIO() + image.original.save(img_bytes, format='PNG') + img_bytes.seek(0) + pil_image = Image.open(img_bytes) + # Use multiple language models and improve OCR accuracy + ocr_text = pytesseract.image_to_string( + pil_image, + lang='chi_sim+chi_tra+eng', # Support both simplified and traditional Chinese + config='--psm 3 --oem 3' # Use more accurate OCR mode + ) + if ocr_text.strip(): + # Clean and normalize OCR text + ocr_text = ocr_text.replace('\x0c', '').strip() + content = f"{content}\n\nOCR Text:\n{ocr_text}" + + metadata = { + "source": blob.source, + "page": page_number, + "has_tables": bool(tables) + } + + yield Document(page_content=content, metadata=metadata) \ No newline at end of file From 370a785d480bcda7efb88411c23f5a589305d7f8 Mon Sep 17 00:00:00 2001 From: weiheng <54829069+309299817@users.noreply.github.com> Date: Wed, 30 Apr 2025 14:58:47 +0800 Subject: [PATCH 2/4] Update api/core/rag/extractor/pdf_extractor.py Since page.extract_text() may return None when no text is found, consider adding a check before performing encoding operations to avoid potential AttributeError. 
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/core/rag/extractor/pdf_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py index 9c71216837..e2196066d7 100644 --- a/api/core/rag/extractor/pdf_extractor.py +++ b/api/core/rag/extractor/pdf_extractor.py @@ -69,7 +69,7 @@ class PdfExtractor(BaseExtractor): with pdfplumber.open(file_obj) as pdf: for page_number, page in enumerate(pdf.pages): # Extract text with layout preservation and encoding detection - content = page.extract_text(layout=True) + content = page.extract_text(layout=True) or "" # Try to detect and fix encoding issues try: # First try to decode as UTF-8 From 0f05a32e2c2feb9477135e10768f884fc4d740ec Mon Sep 17 00:00:00 2001 From: weiheng <54829069+309299817@users.noreply.github.com> Date: Wed, 30 Apr 2025 15:00:47 +0800 Subject: [PATCH 3/4] Update api/core/rag/extractor/pdf_extractor.py The use of 'datetime.UTC' may lead to errors on Python versions before 3.11, where this alias does not exist. Use 'datetime.timezone.utc' instead, which is available on all supported versions. 
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/core/rag/extractor/pdf_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py index e2196066d7..0b519fabc1 100644 --- a/api/core/rag/extractor/pdf_extractor.py +++ b/api/core/rag/extractor/pdf_extractor.py @@ -122,10 +122,10 @@ class PdfExtractor(BaseExtractor): mime_type=mime_type or "", created_by=self._user_id, created_by_role=CreatedByRole.ACCOUNT, - created_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None), + created_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None), used=True, used_by=self._user_id, - used_at=datetime.datetime.now(datetime.UTC).replace(tzinfo=None) + used_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) ) db.session.add(upload_file) db.session.commit() From ca7ffb186e0202f9a2ffb5ae57a4773e081f5e7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AD=8F=E6=81=92?= Date: Thu, 29 May 2025 09:15:27 +0800 Subject: [PATCH 4/4] Upgrade to 1.4.1: replace CreatedByRole with CreatorUserRole from models.enums --- api/core/rag/extractor/pdf_extractor.py | 6 +++--- api/pyproject.toml | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py index 0b519fabc1..c4a8da58a3 100644 --- a/api/core/rag/extractor/pdf_extractor.py +++ b/api/core/rag/extractor/pdf_extractor.py @@ -15,7 +15,7 @@ from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document from extensions.ext_database import db from extensions.ext_storage import storage -from models.enums import CreatedByRole +from models.enums import CreatorUserRole from models.model import UploadFile class PdfExtractor(BaseExtractor): @@ -121,7 +121,7 @@ class PdfExtractor(BaseExtractor): extension=image_ext, mime_type=mime_type or "", created_by=self._user_id, 
created_by_role=CreatedByRole.ACCOUNT, + created_by_role=CreatorUserRole.ACCOUNT, created_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None), used=True, used_by=self._user_id, @@ -159,4 +159,4 @@ class PdfExtractor(BaseExtractor): "has_tables": bool(tables) } - yield Document(page_content=content, metadata=metadata) \ No newline at end of file + yield Document(page_content=content, metadata=metadata) diff --git a/api/pyproject.toml b/api/pyproject.toml index 1c6adb6587..0cb91fec69 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -83,6 +83,8 @@ dependencies = [ "weave~=0.51.0", "yarl~=1.18.3", "webvtt-py~=0.5.1", + "pdfplumber~=0.11.6", + "pytesseract~=0.3.13" ] # Before adding new dependency, consider place it in # alphabet order (a-z) and suitable group.