fix: markdown_extractor loses chunks when the document starts without a header

pull/21309/head
koevas1226 11 months ago
parent ef20f694b2
commit 176cc2759f

@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
continue continue
header_match = re.match(r"^#+\s", line) header_match = re.match(r"^#+\s", line)
if header_match: if header_match:
if current_header is not None:
markdown_tups.append((current_header, current_text)) markdown_tups.append((current_header, current_text))
current_header = line current_header = line
current_text = "" current_text = ""
else: else:
current_text += line + "\n" current_text += line + "\n"
markdown_tups.append((current_header, current_text)) markdown_tups.append((current_header, current_text))
if current_header is not None:
# pass linting, assert keys are defined
markdown_tups = [ markdown_tups = [
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups (re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
] ]
else:
markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
return markdown_tups return markdown_tups

@ -0,0 +1,20 @@
from core.rag.extractor.markdown_extractor import MarkdownExtractor
def test_markdown_to_tups():
    """Verify that text preceding the first markdown header is kept as a chunk.

    The sample document holds one headerless paragraph followed by two headed
    sections, so ``markdown_to_tups`` is expected to return three
    ``(header, text)`` tuples, the first of which has ``None`` as its header.
    """
    # The literal is left unindented so that each "#" header begins its line.
    markdown = """
this is some text without header
# title 1
this is balabala text
## title 2
this is more specific text.
"""
    extractor = MarkdownExtractor(file_path="dummy_path")
    updated_output = extractor.markdown_to_tups(markdown)
    assert len(updated_output) == 3
    key, _ = updated_output[0]
    _, value = updated_output[1]
    # Compare against the None singleton by identity, not equality.
    assert key is None
    assert value.strip() == 'this is balabala text'
Loading…
Cancel
Save