From 17caa92181079d4411160da40ad406479e0321df Mon Sep 17 00:00:00 2001
From: zqgame <zqgame@zqgame.local>
Date: Wed, 7 May 2025 10:21:22 +0800
Subject: [PATCH] Fix: pict-type images in docx files were not processed

---
 api/core/rag/extractor/word_extractor.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py
index 42b42173b9..ed021c6807 100644
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -239,7 +239,7 @@ class WordExtractor(BaseExtractor):
             paragraph_content = []
             for run in paragraph.runs:
                 if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
-                    # 处理drawing类型的图片
+                    # Process drawing type images
                     drawing_elements = run.element.findall(
                         ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
                     )
@@ -258,12 +258,12 @@ class WordExtractor(BaseExtractor):
                                     has_drawing = True
                                     paragraph_content.append(image_map[image_part])
                     
-                    # 处理pict类型的图片
+                    # Process pict type images
                     shape_elements = run.element.findall(
                         ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
                     )
                     for shape in shape_elements:
-                        # 查找VML中的图片数据
+                        # Find embedded binary image data (binData) inside the pict element
                         shape_image = shape.find(
                             ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}binData"
                         )
@@ -276,7 +276,7 @@ class WordExtractor(BaseExtractor):
                                 if image_part in image_map and not has_drawing:
                                     paragraph_content.append(image_map[image_part])
                         
-                        # 查找VML中的imagedata元素
+                        # Find imagedata element in VML
                         image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
                         if image_data is not None:
                             image_id = image_data.get("id") or image_data.get(