fix: markdown_extractor loses chunks when the document starts without a header

11 months ago · 176cc2759f
parent ef20f694b2
commit 176cc2759f
2 changed files with 25 additions and 10 deletions
--- a/api/core/rag/extractor/markdown_extractor.py
+++ b/api/core/rag/extractor/markdown_extractor.py
@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
                continue
            header_match = re.match(r"^#+\s", line)
            if header_match:
                if current_header is not None:
                markdown_tups.append((current_header, current_text))
                current_header = line
                current_text = ""
            else:
                current_text += line + "\n"
        markdown_tups.append((current_header, current_text))
        if current_header is not None:
            # pass linting, assert keys are defined
        markdown_tups = [
-                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
+            (re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
            for key, value in markdown_tups
        ]
        else:
            markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
        return markdown_tups
--- a/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py
+++ b/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py
@ -0,0 +1,20 @@
 from core.rag.extractor.markdown_extractor import MarkdownExtractor
 def test_markdown_to_tups():
     """Check that text preceding the first markdown header is kept as a chunk.

     The sample document opens with a headerless paragraph followed by two
     headed sections, so three (header, text) tuples are expected and the
     first tuple must carry ``None`` as its header.
     """
     markdown = """
 this is some text without header
 # title 1
 this is balabala text
 ## title 2
 this is more specific text.
        """
     extractor = MarkdownExtractor(file_path="dummy_path")
     updated_output = extractor.markdown_to_tups(markdown)

     # One chunk for the headerless preamble plus one per header.
     assert len(updated_output) == 3

     key, _ = updated_output[0]
     _, value = updated_output[1]

     # The preamble has no header; compare to the None singleton by identity.
     assert key is None
     assert value.strip() == "this is balabala text"