diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py
index d0a4aea5ab..cbd06fc186 100644
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -1,19 +1,13 @@
-import hashlib
 import mimetypes
-import os
 import re
-import site
-import unicodedata
-from contextlib import contextmanager
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Any, Literal, Optional, cast
+from typing import Any, Optional, cast
 from urllib.parse import unquote
 
 import chardet
 import cloudscraper  # type: ignore
-from bs4 import BeautifulSoup, CData, Comment, NavigableString  # type: ignore
-from readability import Document  # type: ignore
-from regex import regex  # type: ignore
+from readabilipy import simple_json_from_html_string  # type: ignore
 
 from core.helper import ssrf_proxy
 from core.rag.extractor import extract_processor
@@ -69,8 +63,8 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str:
         response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     elif response.status_code == 403:
         scraper = cloudscraper.create_scraper()
-        scraper.perform_request = ssrf_proxy.make_request
-        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
+        scraper.perform_request = ssrf_proxy.make_request  # type: ignore
+        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))  # type: ignore
 
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
@@ -88,7 +82,7 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str:
 
     article = extract_using_readabilipy(content)
 
-    if not article.text.strip():
+    if not article.text:
         return ""
 
     res = FULL_TEMPLATE.format(
@@ -104,231 +98,20 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str:
 class Article:
     title: str
     auther: str
-    text: str
+    text: Sequence[dict]
 
 
 def extract_using_readabilipy(html: str):
-    doc = Document(html)
+    json_article: dict[str, Any] = simple_json_from_html_string(html, use_readability=True)
     article = Article(
-        title=doc.title(),
-        auther=doc.author(),
-        text=plain_content(
-            readability_content=doc.content(),
-            content_digests=False,
-            node_indexes=False,
-        ),
+        title=json_article.get("title") or "",  # missing or falsy title becomes ""
+        auther=json_article.get("byline") or "",  # the "byline" value fills the `auther` field
+        text=json_article.get("plain_text") or [],  # missing or falsy plain_text becomes []
     )
 
     return article
 
 
-def find_module_path(module_name):
-    for package_path in site.getsitepackages():
-        potential_path = os.path.join(package_path, module_name)
-        if os.path.exists(potential_path):
-            return potential_path
-
-    return None
-
-
-@contextmanager
-def chdir(path):
-    """Change directory in context and return to original on exit"""
-    # From https://stackoverflow.com/a/37996581, couldn't find a built-in
-    original_path = os.getcwd()
-    os.chdir(path)
-    try:
-        yield
-    finally:
-        os.chdir(original_path)
-
-
-def extract_text_blocks_as_plain_text(paragraph_html):
-    # Load article as DOM
-    soup = BeautifulSoup(paragraph_html, "html.parser")
-    # Select all lists
-    list_elements = soup.find_all(["ul", "ol"])
-    # Prefix text in all list items with "* " and make lists paragraphs
-    for list_element in list_elements:
-        plain_items = "".join(
-            list(filter(None, [plain_text_leaf_node(li)["text"] for li in list_element.find_all("li")]))
-        )
-        list_element.string = plain_items
-        list_element.name = "p"
-    # Select all text blocks
-    text_blocks = [s.parent for s in soup.find_all(string=True)]
-    text_blocks = [plain_text_leaf_node(block) for block in text_blocks]
-    # Drop empty paragraphs
-    text_blocks = list(filter(lambda p: p["text"] is not None, text_blocks))
-    return text_blocks
-
-
-def plain_text_leaf_node(element):
-    # Extract all text, stripped of any child HTML elements and normalize it
-    plain_text = normalize_text(element.get_text())
-    if plain_text != "" and element.name == "li":
-        plain_text = "* {}, ".format(plain_text)
-    if plain_text == "":
-        plain_text = None
-    if "data-node-index" in element.attrs:
-        plain = {"node_index": element["data-node-index"], "text": plain_text}
-    else:
-        plain = {"text": plain_text}
-    return plain
-
-
-def plain_content(readability_content, content_digests, node_indexes):
-    # Load article as DOM
-    soup = BeautifulSoup(readability_content, "html.parser")
-    # Make all elements plain
-    elements = plain_elements(soup.contents, content_digests, node_indexes)
-    if node_indexes:
-        # Add node index attributes to nodes
-        elements = [add_node_indexes(element) for element in elements]
-    # Replace article contents with plain elements
-    soup.contents = elements
-    return str(soup)
-
-
-def plain_elements(elements, content_digests, node_indexes):
-    # Get plain content versions of all elements
-    elements = [plain_element(element, content_digests, node_indexes) for element in elements]
-    if content_digests:
-        # Add content digest attribute to nodes
-        elements = [add_content_digest(element) for element in elements]
-    return elements
-
-
-def plain_element(element, content_digests, node_indexes):
-    # For lists, we make each item plain text
-    if is_leaf(element):
-        # For leaf node elements, extract the text content, discarding any HTML tags
-        # 1. Get element contents as text
-        plain_text = element.get_text()
-        # 2. Normalize the extracted text string to a canonical representation
-        plain_text = normalize_text(plain_text)
-        # 3. Update element content to be plain text
-        element.string = plain_text
-    elif is_text(element):
-        if is_non_printing(element):
-            # The simplified HTML may have come from Readability.js so might
-            # have non-printing text (e.g. Comment or CData). In this case, we
-            # keep the structure, but ensure that the string is empty.
-            element = type(element)("")
-        else:
-            plain_text = element.string
-            plain_text = normalize_text(plain_text)
-            element = type(element)(plain_text)
-    else:
-        # If not a leaf node or leaf type call recursively on child nodes, replacing
-        element.contents = plain_elements(element.contents, content_digests, node_indexes)
-    return element
-
-
-def add_node_indexes(element, node_index="0"):
-    # Can't add attributes to string types
-    if is_text(element):
-        return element
-    # Add index to current element
-    element["data-node-index"] = node_index
-    # Add index to child elements
-    for local_idx, child in enumerate([c for c in element.contents if not is_text(c)], start=1):
-        # Can't add attributes to leaf string types
-        child_index = "{stem}.{local}".format(stem=node_index, local=local_idx)
-        add_node_indexes(child, node_index=child_index)
-    return element
-
-
-def normalize_text(text):
-    """Normalize unicode and whitespace."""
-    # Normalize unicode first to try and standardize whitespace characters as much as possible before normalizing them
-    text = strip_control_characters(text)
-    text = normalize_unicode(text)
-    text = normalize_whitespace(text)
-    return text
-
-
-def strip_control_characters(text):
-    """Strip out unicode control characters which might break the parsing."""
-    # Unicode control characters
-    #   [Cc]: Other, Control [includes new lines]
-    #   [Cf]: Other, Format
-    #   [Cn]: Other, Not Assigned
-    #   [Co]: Other, Private Use
-    #   [Cs]: Other, Surrogate
-    control_chars = {"Cc", "Cf", "Cn", "Co", "Cs"}
-    retained_chars = ["\t", "\n", "\r", "\f"]
-
-    # Remove non-printing control characters
-    return "".join(
-        [
-            "" if (unicodedata.category(char) in control_chars) and (char not in retained_chars) else char
-            for char in text
-        ]
-    )
-
-
-def normalize_unicode(text):
-    """Normalize unicode such that things that are visually equivalent map to the same unicode string where possible."""
-    normal_form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFKC"
-    text = unicodedata.normalize(normal_form, text)
-    return text
-
-
-def normalize_whitespace(text):
-    """Replace runs of whitespace characters with a single space as this is what happens when HTML text is displayed."""
-    text = regex.sub(r"\s+", " ", text)
-    # Remove leading and trailing whitespace
-    text = text.strip()
-    return text
-
-
-def is_leaf(element):
-    return element.name in {"p", "li"}
-
-
-def is_text(element):
-    return isinstance(element, NavigableString)
-
-
-def is_non_printing(element):
-    return any(isinstance(element, _e) for _e in [Comment, CData])
-
-
-def add_content_digest(element):
-    if not is_text(element):
-        element["data-content-digest"] = content_digest(element)
-    return element
-
-
-def content_digest(element):
-    digest: Any
-    if is_text(element):
-        # Hash
-        trimmed_string = element.string.strip()
-        if trimmed_string == "":
-            digest = ""
-        else:
-            digest = hashlib.sha256(trimmed_string.encode("utf-8")).hexdigest()
-    else:
-        contents = element.contents
-        num_contents = len(contents)
-        if num_contents == 0:
-            # No hash when no child elements exist
-            digest = ""
-        elif num_contents == 1:
-            # If single child, use digest of child
-            digest = content_digest(contents[0])
-        else:
-            # Build content digest from the "non-empty" digests of child nodes
-            digest = hashlib.sha256()
-            child_digests = list(filter(lambda x: x != "", [content_digest(content) for content in contents]))
-            for child in child_digests:
-                digest.update(child.encode("utf-8"))
-            digest = digest.hexdigest()
-    return digest
-
-
 def get_image_upload_file_ids(content):
     pattern = r"!\[image\]\((http?://.*?(file-preview|image-preview))\)"
     matches = re.findall(pattern, content)
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 7f0c2e98b5..1c6adb6587 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -83,7 +83,6 @@ dependencies = [
     "weave~=0.51.0",
     "yarl~=1.18.3",
     "webvtt-py~=0.5.1",
-    "readability-lxml~=0.8.4.1",
 ]
 # Before adding new dependency, consider place it in
 # alphabet order (a-z) and suitable group.
diff --git a/api/uv.lock b/api/uv.lock
index d723bd5936..033dc8762b 100644
--- a/api/uv.lock
+++ b/api/uv.lock
@@ -1151,15 +1151,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/87/62/d69eb4a8ee231f4bf733a92caf9da13f1c81a44e874b1d4080c25ecbb723/cryptography-44.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c", size = 3134369 },
 ]
 
-[[package]]
-name = "cssselect"
-version = "1.3.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786 },
-]
-
 [[package]]
 name = "dataclasses-json"
 version = "0.6.7"
@@ -1285,7 +1276,6 @@ dependencies = [
     { name = "python-dotenv" },
     { name = "pyyaml" },
     { name = "readabilipy" },
-    { name = "readability-lxml" },
     { name = "redis", extra = ["hiredis"] },
     { name = "resend" },
     { name = "sentry-sdk", extra = ["flask"] },
@@ -1457,7 +1447,6 @@ requires-dist = [
     { name = "python-dotenv", specifier = "==1.0.1" },
     { name = "pyyaml", specifier = "~=6.0.1" },
     { name = "readabilipy", specifier = "~=0.3.0" },
-    { name = "readability-lxml", specifier = "~=0.8.4.1" },
     { name = "redis", extras = ["hiredis"], specifier = "~=6.1.0" },
     { name = "resend", specifier = "~=2.9.0" },
     { name = "sentry-sdk", extras = ["flask"], specifier = "~=2.28.0" },
@@ -2942,23 +2931,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095 },
 ]
 
-[package.optional-dependencies]
-html-clean = [
-    { name = "lxml-html-clean" },
-]
-
-[[package]]
-name = "lxml-html-clean"
-version = "0.4.2"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "lxml" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/79/b6/466e71db127950fb8d172026a8f0a9f0dc6f64c8e78e2ca79f252e5790b8/lxml_html_clean-0.4.2.tar.gz", hash = "sha256:91291e7b5db95430abf461bc53440964d58e06cc468950f9e47db64976cebcb3", size = 21622 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/4e/0b/942cb7278d6caad79343ad2ddd636ed204a47909b969d19114a3097f5aa3/lxml_html_clean-0.4.2-py3-none-any.whl", hash = "sha256:74ccfba277adcfea87a1e9294f47dd86b05d65b4da7c5b07966e3d5f3be8a505", size = 14184 },
-]
-
 [[package]]
 name = "lxml-stubs"
 version = "0.5.1"
@@ -4866,20 +4838,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/dd/46/8a640c6de1a6c6af971f858b2fb178ca5e1db91f223d8ba5f40efe1491e5/readabilipy-0.3.0-py3-none-any.whl", hash = "sha256:d106da0fad11d5fdfcde21f5c5385556bfa8ff0258483037d39ea6b1d6db3943", size = 22158 },
 ]
 
-[[package]]
-name = "readability-lxml"
-version = "0.8.4.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "chardet" },
-    { name = "cssselect" },
-    { name = "lxml", extra = ["html-clean"] },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/55/3e/dc87d97532ddad58af786ec89c7036182e352574c1cba37bf2bf783d2b15/readability_lxml-0.8.4.1.tar.gz", hash = "sha256:9d2924f5942dd7f37fb4da353263b22a3e877ccf922d0e45e348e4177b035a53", size = 22874 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c7/75/2cc58965097e351415af420be81c4665cf80da52a17ef43c01ffbe2caf91/readability_lxml-0.8.4.1-py3-none-any.whl", hash = "sha256:874c0cea22c3bf2b78c7f8df831bfaad3c0a89b7301d45a188db581652b4b465", size = 19912 },
-]
-
 [[package]]
 name = "realtime"
 version = "2.4.3"