refactor: improve handling of leading punctuation removal (#10761)
parent
0ba17ec116
commit
14f3d44c37
@ -0,0 +1,16 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def remove_leading_symbols(text: str) -> str:
    """
    Remove leading punctuation or symbols from the given text.

    Only the run of matching characters at the very start of the string is
    removed; punctuation that appears later in the text is left untouched.
    ASCII whitespace (e.g. a plain space) is not part of the character class,
    so a string that starts with a space is returned unchanged. The Unicode
    ranges do include a few space-like code points, such as U+3000
    (ideographic space).

    Args:
        text (str): The input text to process.

    Returns:
        str: The text with leading punctuation or symbols removed.
    """
    # The segments below concatenate to one character class anchored at the
    # start of the string. Without re.MULTILINE, "^" matches only at index 0,
    # so at most one (leading) run is ever removed.
    pattern = (
        r"^["
        r"\u2000-\u206F"  # Unicode block: General Punctuation
        r"\u2E00-\u2E7F"  # Unicode block: Supplemental Punctuation
        r"\u3000-\u303F"  # Unicode block: CJK Symbols and Punctuation
        r"!\"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~"  # ASCII punctuation (backslash is not included)
        r"]+"
    )
    return re.sub(pattern, "", text)
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
from textwrap import dedent
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from core.tools.utils.text_processing_utils import remove_leading_symbols
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # Leading ASCII dots are stripped; later punctuation is kept.
        ("...Hello, World!", "Hello, World!"),
        # Leading CJK full stop (U+3002) is stripped.
        ("。测试中文标点", "测试中文标点"),
        # A run of mixed ASCII symbols is stripped as a whole.
        ("!@#Test symbols", "Test symbols"),
        # Text that does not start with punctuation is returned unchanged.
        ("Hello, World!", "Hello, World!"),
        # Empty input stays empty.
        ("", ""),
        # A plain space is not treated as a symbol.
        (" ", " "),
    ],
)
def test_remove_leading_symbols(input_text, expected_output):
    """Check that remove_leading_symbols strips only the leading symbol run."""
    assert remove_leading_symbols(input_text) == expected_output
|
||||||
Loading…
Reference in New Issue