From 370a785d480bcda7efb88411c23f5a589305d7f8 Mon Sep 17 00:00:00 2001
From: weiheng <54829069+309299817@users.noreply.github.com>
Date: Wed, 30 Apr 2025 14:58:47 +0800
Subject: [PATCH] Update api/core/rag/extractor/pdf_extractor.py

page.extract_text() may return None when no text is found on a page. Fall back to an empty string so the subsequent encoding operations do not raise an AttributeError.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 api/core/rag/extractor/pdf_extractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py
index 9c71216837..e2196066d7 100644
--- a/api/core/rag/extractor/pdf_extractor.py
+++ b/api/core/rag/extractor/pdf_extractor.py
@@ -69,7 +69,7 @@ class PdfExtractor(BaseExtractor):
             with pdfplumber.open(file_obj) as pdf:
                 for page_number, page in enumerate(pdf.pages):
                     # Extract text with layout preservation and encoding detection
-                    content = page.extract_text(layout=True)
+                    content = page.extract_text(layout=True) or ""
                     # Try to detect and fix encoding issues
                     try:
                         # First try to decode as UTF-8