|
|
|
|
@@ -30,11 +30,19 @@ class UnstructuredPPTXLoader(BaseLoader):
|
|
|
|
|
from unstructured.partition.pptx import partition_pptx
|
|
|
|
|
|
|
|
|
|
elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
|
|
|
|
|
from unstructured.chunking.title import chunk_by_title
|
|
|
|
|
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
|
|
|
|
|
text_by_page = {}
|
|
|
|
|
for element in elements:
|
|
|
|
|
page = element.metadata.page_number
|
|
|
|
|
text = element.text
|
|
|
|
|
if page in text_by_page:
|
|
|
|
|
text_by_page[page] += "\n" + text
|
|
|
|
|
else:
|
|
|
|
|
text_by_page[page] = text
|
|
|
|
|
|
|
|
|
|
combined_texts = list(text_by_page.values())
|
|
|
|
|
documents = []
|
|
|
|
|
for chunk in chunks:
|
|
|
|
|
text = chunk.text.strip()
|
|
|
|
|
for combined_text in combined_texts:
|
|
|
|
|
text = combined_text.strip()
|
|
|
|
|
documents.append(Document(page_content=text))
|
|
|
|
|
|
|
|
|
|
return documents
|
|
|
|
|
|