|
|
|
|
@ -1,9 +1,12 @@
|
|
|
|
|
"""Abstract interface for document loader implementations."""
|
|
|
|
|
import datetime
|
|
|
|
|
import logging
|
|
|
|
|
import mimetypes
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import tempfile
|
|
|
|
|
import uuid
|
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
@ -16,6 +19,7 @@ from extensions.ext_database import db
|
|
|
|
|
from extensions.ext_storage import storage
|
|
|
|
|
from models.model import UploadFile
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
class WordExtractor(BaseExtractor):
|
|
|
|
|
"""Load docx files.
|
|
|
|
|
@ -197,6 +201,30 @@ class WordExtractor(BaseExtractor):
|
|
|
|
|
|
|
|
|
|
image_map = self._extract_images_from_docx(doc, image_folder)
|
|
|
|
|
|
|
|
|
|
# Rewrite hyperlink runs as Markdown links, in place on the document's runs.
# A URL found in a run whose XML contains a HYPERLINK instruction is held in
# `hyperlinks_url` and attached to the next run that has text.
hyperlinks_url = None
# NOTE(review): the `http://` alternative only matches when the URL is
# followed by a further literal "//", so most plain http links are not
# captured; the character class also admits quote characters. Pattern is
# left unchanged here -- confirm the intended URL shape before altering it.
url_pattern = re.compile(r'http://[^\s+]+//|https://[^\s+]+')
for para in doc.paragraphs:
    for run in para.runs:
        # Consume a pending URL on the first run that carries visible text.
        if run.text and hyperlinks_url:
            run.text = f' [{run.text}]({hyperlinks_url}) '
            hyperlinks_url = None
        if 'HYPERLINK' not in run.element.xml:
            continue
        try:
            for node in ET.XML(run.element.xml).iter():
                # Only instruction-text elements are scanned; skip ones with
                # no text so findall() is never handed None.
                if not node.tag.endswith('instrText') or not node.text:
                    continue
                matches = url_pattern.findall(node.text)
                if matches:
                    # When several URLs match, the last one wins.
                    hyperlinks_url = str(matches[-1])
        except Exception:
            # Best effort: a run whose XML cannot be parsed must not abort
            # extraction of the whole document; keep the traceback for debugging.
            logger.exception('Failed to parse hyperlink field from run XML')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_paragraph(paragraph):
|
|
|
|
|
paragraph_content = []
|
|
|
|
|
for run in paragraph.runs:
|
|
|
|
|
|