From 370a785d480bcda7efb88411c23f5a589305d7f8 Mon Sep 17 00:00:00 2001 From: weiheng <54829069+309299817@users.noreply.github.com> Date: Wed, 30 Apr 2025 14:58:47 +0800 Subject: [PATCH] Update api/core/rag/extractor/pdf_extractor.py Since page.extract_text() may return None when no text is found, consider adding a check before performing encoding operations to avoid potential AttributeError. Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/core/rag/extractor/pdf_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py index 9c71216837..e2196066d7 100644 --- a/api/core/rag/extractor/pdf_extractor.py +++ b/api/core/rag/extractor/pdf_extractor.py @@ -69,7 +69,7 @@ class PdfExtractor(BaseExtractor): with pdfplumber.open(file_obj) as pdf: for page_number, page in enumerate(pdf.pages): # Extract text with layout preservation and encoding detection - content = page.extract_text(layout=True) + content = page.extract_text(layout=True) or "" # Try to detect and fix encoding issues try: # First try to decode as UTF-8