|
|
|
|
@ -1,12 +1,20 @@
|
|
|
|
|
"""Abstract interface for document loader implementations."""
|
|
|
|
|
import datetime
|
|
|
|
|
import mimetypes
|
|
|
|
|
import os
|
|
|
|
|
import tempfile
|
|
|
|
|
import uuid
|
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
from docx import Document as DocxDocument
|
|
|
|
|
from flask import current_app
|
|
|
|
|
|
|
|
|
|
from core.rag.extractor.extractor_base import BaseExtractor
|
|
|
|
|
from core.rag.models.document import Document
|
|
|
|
|
from extensions.ext_database import db
|
|
|
|
|
from extensions.ext_storage import storage
|
|
|
|
|
from models.model import UploadFile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WordExtractor(BaseExtractor):
|
|
|
|
|
@ -17,9 +25,12 @@ class WordExtractor(BaseExtractor):
|
|
|
|
|
file_path: Path to the file to load.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, file_path: str):
|
|
|
|
|
def __init__(self, file_path: str, tenant_id: str, user_id: str):
|
|
|
|
|
"""Initialize with file path."""
|
|
|
|
|
self.file_path = file_path
|
|
|
|
|
self.tenant_id = tenant_id
|
|
|
|
|
self.user_id = user_id
|
|
|
|
|
|
|
|
|
|
if "~" in self.file_path:
|
|
|
|
|
self.file_path = os.path.expanduser(self.file_path)
|
|
|
|
|
|
|
|
|
|
@ -45,12 +56,7 @@ class WordExtractor(BaseExtractor):
|
|
|
|
|
|
|
|
|
|
def extract(self) -> list[Document]:
    """Load the whole document as a single Document.

    Returns:
        A one-element list whose Document carries the parsed content and
        the source path as metadata.
    """
    # parse_docx handles paragraphs, tables and embedded images; the old
    # plain-text extraction via document.paragraphs was dead code whose
    # result was immediately overwritten.
    content = self.parse_docx(self.file_path, 'storage')
    return [Document(
        page_content=content,
        metadata={"source": self.file_path},
    )]
|
|
|
|
|
@ -61,3 +67,111 @@ class WordExtractor(BaseExtractor):
|
|
|
|
|
"""Check if the url is valid."""
|
|
|
|
|
parsed = urlparse(url)
|
|
|
|
|
return bool(parsed.netloc) and bool(parsed.scheme)
|
|
|
|
|
|
|
|
|
|
def _extract_images_from_docx(self, doc, image_folder):
    """Persist every embedded image and map its docx part to a markdown link.

    Each image relationship is uploaded to storage under a per-tenant,
    uuid-named key, recorded as an UploadFile row, and mapped (keyed by its
    docx image *part*) to a markdown image reference that the console API
    serves as a preview.

    Args:
        doc: An opened python-docx Document.
        image_folder: Local folder name; created if absent (files actually
            go to remote storage — presumably legacy, TODO confirm).

    Returns:
        dict mapping docx image parts to markdown image link strings.
    """
    os.makedirs(image_folder, exist_ok=True)
    image_map = {}

    for rel in doc.part.rels.values():
        if "image" in rel.target_ref:
            image_ext = rel.target_ref.split('.')[-1]
            # use a uuid as file name to avoid collisions
            file_uuid = str(uuid.uuid4())
            file_key = 'image_files/' + self.tenant_id + '/' + file_uuid + '.' + image_ext
            mime_type, _ = mimetypes.guess_type(file_key)

            storage.save(file_key, rel.target_part.blob)
            # save file record to db
            config = current_app.config
            upload_file = UploadFile(
                tenant_id=self.tenant_id,
                storage_type=config['STORAGE_TYPE'],
                key=file_key,
                name=file_key,
                # was hard-coded 0; record the actual blob size
                size=len(rel.target_part.blob),
                extension=image_ext,
                mime_type=mime_type,
                created_by=self.user_id,
                created_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None),
                used=True,
                used_by=self.user_id,
                used_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
            )

            db.session.add(upload_file)
            db.session.commit()
            # Markdown image link served by the console API.
            # NOTE(review): the original f-string was truncated
            # (f"}/files/…"); reconstructed with the console base URL from
            # config — confirm CONSOLE_API_URL is the right key.
            image_map[rel.target_part] = (
                f"![image]({config['CONSOLE_API_URL']}/files/{upload_file.id}/image-preview)"
            )

    return image_map
|
|
|
|
|
|
|
|
|
|
def _table_to_markdown(self, table):
|
|
|
|
|
markdown = ""
|
|
|
|
|
# deal with table headers
|
|
|
|
|
header_row = table.rows[0]
|
|
|
|
|
headers = [cell.text for cell in header_row.cells]
|
|
|
|
|
markdown += "| " + " | ".join(headers) + " |\n"
|
|
|
|
|
markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n"
|
|
|
|
|
# deal with table rows
|
|
|
|
|
for row in table.rows[1:]:
|
|
|
|
|
row_cells = [cell.text for cell in row.cells]
|
|
|
|
|
markdown += "| " + " | ".join(row_cells) + " |\n"
|
|
|
|
|
|
|
|
|
|
return markdown
|
|
|
|
|
|
|
|
|
|
def _parse_paragraph(self, paragraph, image_map):
    """Return the paragraph's text with inline images replaced by links.

    Args:
        paragraph: A python-docx paragraph object.
        image_map: Mapping of docx image parts to markdown links, as built
            by _extract_images_from_docx.

    Returns:
        Space-joined run text / image links, or '' if the paragraph yields
        nothing.
    """
    paragraph_content = []
    for run in paragraph.runs:
        # a:blip elements reference embedded images by relationship id.
        # Evaluate the xpath once instead of twice.
        blips = run.element.xpath('.//a:blip')
        for blip in blips:
            embed_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
            if embed_id:
                # image_map is keyed by the relationship's target *part*
                # (see _extract_images_from_docx); the previous code looked
                # up target_ref (a string) and could never match.
                rel = run.part.rels[embed_id]
                if rel.target_part in image_map:
                    paragraph_content.append(image_map[rel.target_part])
        if run.text.strip():
            paragraph_content.append(run.text.strip())
    return ' '.join(paragraph_content) if paragraph_content else ''
|
|
|
|
|
|
|
|
|
|
def parse_docx(self, docx_path, image_folder):
    """Parse a .docx file into markdown-ish text.

    Paragraph text and inline image links are emitted in document order;
    tables are rendered as markdown tables. Embedded images are uploaded
    via _extract_images_from_docx and replaced with markdown links.

    Args:
        docx_path: Path to the .docx file.
        image_folder: Folder name passed through to image extraction.

    Returns:
        The document content as a single newline-joined string.
    """
    doc = DocxDocument(docx_path)
    os.makedirs(image_folder, exist_ok=True)

    content = []

    image_map = self._extract_images_from_docx(doc, image_folder)

    def parse_paragraph(paragraph):
        # Collect run text and inline image links in run order.
        paragraph_content = []
        for run in paragraph.runs:
            if run.element.tag.endswith('r'):
                drawing_elements = run.element.findall(
                    './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
                for drawing in drawing_elements:
                    blip_elements = drawing.findall(
                        './/{http://schemas.openxmlformats.org/drawingml/2006/main}blip')
                    for blip in blip_elements:
                        embed_id = blip.get(
                            '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
                        if embed_id:
                            image_part = doc.part.related_parts.get(embed_id)
                            if image_part in image_map:
                                paragraph_content.append(image_map[image_part])
            if run.text.strip():
                paragraph_content.append(run.text.strip())
        return ''.join(paragraph_content) if paragraph_content else ''

    # Walk the raw XML body so paragraphs and tables keep their original
    # interleaved order. Iterators replace the old copy()/pop(0) pattern,
    # which was O(n^2) on large documents.
    paragraph_iter = iter(doc.paragraphs)
    table_iter = iter(doc.tables)
    for element in doc.element.body:
        if element.tag.endswith('p'):  # paragraph
            parsed_paragraph = parse_paragraph(next(paragraph_iter))
            if parsed_paragraph:
                content.append(parsed_paragraph)
        elif element.tag.endswith('tbl'):  # table
            content.append(self._table_to_markdown(next(table_iter)))
    return '\n'.join(content)
|
|
|
|
|
|
|
|
|
|
|