fix: Refactor web reader to use readability-lxml

Replaces the custom readability parsing logic with the readability-lxml library for improved performance and simplicity. This change reduces code complexity by eliminating the need for Readability.js and Node.js subprocesses, streamlining the parsing process. Introduces a dataclass for structured article data; the output template now emits a single AUTHOR field in place of the previous AUTHORS, PUBLISH DATE, and TOP_IMAGE_URL fields. Updates dependencies to include readability-lxml and its requirements. The result is improved maintainability and performance.

Signed-off-by: -LAN- <laipz8200@outlook.com>
1 year ago · cbe020be23
parent 55503ce771
commit cbe020be23
3 changed files with 71 additions and 58 deletions
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -1,20 +1,18 @@
 import hashlib
-import json
 import mimetypes
 import os
 import re
 import site
-import subprocess
-import tempfile
 import unicodedata
 from contextlib import contextmanager
-from pathlib import Path
+from dataclasses import dataclass
 from typing import Any, Literal, Optional, cast
 from urllib.parse import unquote

 import chardet
 import cloudscraper  # type: ignore
 from bs4 import BeautifulSoup, CData, Comment, NavigableString  # type: ignore
+from readability import Document  # type: ignore
 from regex import regex  # type: ignore

 from core.helper import ssrf_proxy
@@ -23,9 +21,7 @@ from core.rag.extractor.extract_processor import ExtractProcessor

 FULL_TEMPLATE = """
 TITLE: {title}
-AUTHORS: {authors}
-PUBLISH DATE: {publish_date}
-TOP_IMAGE_URL: {top_image}
+AUTHOR: {author}
 TEXT:

 {text}
@@ -90,66 +86,40 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str:
    else:
        content = response.text

-    a = extract_using_readabilipy(content)
+    article = extract_using_readabilipy(content)

-    if not a["plain_text"] or not a["plain_text"].strip():
+    if not article.text.strip():
        return ""

    res = FULL_TEMPLATE.format(
-        title=a["title"],
-        authors=a["byline"],
-        publish_date=a["date"],
-        top_image="",
-        text=a["plain_text"] or "",
+        title=article.title,
+        author=article.auther,
+        text=article.text,
    )

    return res


-def extract_using_readabilipy(html):
-    with tempfile.NamedTemporaryFile(delete=False, mode="w+") as f_html:
-        f_html.write(html)
-        f_html.close()
-    html_path = f_html.name
-
-    # Call Mozilla's Readability.js Readability.parse() function via node, writing output to a temporary file
-    article_json_path = html_path + ".json"
-    jsdir = os.path.join(find_module_path("readabilipy"), "javascript")
-    with chdir(jsdir):
-        subprocess.check_call(["node", "ExtractArticle.js", "-i", html_path, "-o", article_json_path])
-
-    # Read output of call to Readability.parse() from JSON file and return as Python dictionary
-    input_json = json.loads(Path(article_json_path).read_text(encoding="utf-8"))
-
-    # Deleting files after processing
-    os.unlink(article_json_path)
-    os.unlink(html_path)
-
-    article_json: dict[str, Any] = {
-        "title": None,
-        "byline": None,
-        "date": None,
-        "content": None,
-        "plain_content": None,
-        "plain_text": None,
-    }
-    # Populate article fields from readability fields where present
-    if input_json:
-        if input_json.get("title"):
-            article_json["title"] = input_json["title"]
-        if input_json.get("byline"):
-            article_json["byline"] = input_json["byline"]
-        if input_json.get("date"):
-            article_json["date"] = input_json["date"]
-        if input_json.get("content"):
-            article_json["content"] = input_json["content"]
-            article_json["plain_content"] = plain_content(article_json["content"], False, False)
-            article_json["plain_text"] = extract_text_blocks_as_plain_text(article_json["plain_content"])
-        if input_json.get("textContent"):
-            article_json["plain_text"] = input_json["textContent"]
-            article_json["plain_text"] = re.sub(r"\n\s*\n", "\n", article_json["plain_text"])
-
-    return article_json
+@dataclass
+class Article:
+    title: str
+    auther: str
+    text: str
+
+
+def extract_using_readabilipy(html: str):
+    doc = Document(html)
+    article = Article(
+        title=doc.title(),
+        auther=doc.author(),
+        text=plain_content(
+            readability_content=doc.content(),
+            content_digests=False,
+            node_indexes=False,
+        ),
+    )
+
+    return article


 def find_module_path(module_name):
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -83,6 +83,7 @@ dependencies = [
    "weave~=0.51.0",
    "yarl~=1.18.3",
    "webvtt-py~=0.5.1",
+    "readability-lxml~=0.8.4.1",
 ]
 # Before adding new dependency, consider place it in
 # alphabet order (a-z) and suitable group.
--- a/api/uv.lock
+++ b/api/uv.lock
@@ -1151,6 +1151,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/87/62/d69eb4a8ee231f4bf733a92caf9da13f1c81a44e874b1d4080c25ecbb723/cryptography-44.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c", size = 3134369 },
 ]

+[[package]]
+name = "cssselect"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786 },
+]
+
 [[package]]
 name = "dataclasses-json"
 version = "0.6.7"
@@ -1276,6 +1285,7 @@ dependencies = [
    { name = "python-dotenv" },
    { name = "pyyaml" },
    { name = "readabilipy" },
+    { name = "readability-lxml" },
    { name = "redis", extra = ["hiredis"] },
    { name = "resend" },
    { name = "sentry-sdk", extra = ["flask"] },
@@ -1447,6 +1457,7 @@ requires-dist = [
    { name = "python-dotenv", specifier = "==1.0.1" },
    { name = "pyyaml", specifier = "~=6.0.1" },
    { name = "readabilipy", specifier = "~=0.3.0" },
+    { name = "readability-lxml", specifier = "~=0.8.4.1" },
    { name = "redis", extras = ["hiredis"], specifier = "~=6.1.0" },
    { name = "resend", specifier = "~=2.9.0" },
    { name = "sentry-sdk", extras = ["flask"], specifier = "~=2.28.0" },
@@ -2931,6 +2942,23 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095 },
 ]

+[package.optional-dependencies]
+html-clean = [
+    { name = "lxml-html-clean" },
+]
+
+[[package]]
+name = "lxml-html-clean"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "lxml" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/79/b6/466e71db127950fb8d172026a8f0a9f0dc6f64c8e78e2ca79f252e5790b8/lxml_html_clean-0.4.2.tar.gz", hash = "sha256:91291e7b5db95430abf461bc53440964d58e06cc468950f9e47db64976cebcb3", size = 21622 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4e/0b/942cb7278d6caad79343ad2ddd636ed204a47909b969d19114a3097f5aa3/lxml_html_clean-0.4.2-py3-none-any.whl", hash = "sha256:74ccfba277adcfea87a1e9294f47dd86b05d65b4da7c5b07966e3d5f3be8a505", size = 14184 },
+]
+
 [[package]]
 name = "lxml-stubs"
 version = "0.5.1"
@@ -4838,6 +4866,20 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/dd/46/8a640c6de1a6c6af971f858b2fb178ca5e1db91f223d8ba5f40efe1491e5/readabilipy-0.3.0-py3-none-any.whl", hash = "sha256:d106da0fad11d5fdfcde21f5c5385556bfa8ff0258483037d39ea6b1d6db3943", size = 22158 },
 ]

+[[package]]
+name = "readability-lxml"
+version = "0.8.4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "chardet" },
+    { name = "cssselect" },
+    { name = "lxml", extra = ["html-clean"] },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/3e/dc87d97532ddad58af786ec89c7036182e352574c1cba37bf2bf783d2b15/readability_lxml-0.8.4.1.tar.gz", hash = "sha256:9d2924f5942dd7f37fb4da353263b22a3e877ccf922d0e45e348e4177b035a53", size = 22874 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/75/2cc58965097e351415af420be81c4665cf80da52a17ef43c01ffbe2caf91/readability_lxml-0.8.4.1-py3-none-any.whl", hash = "sha256:874c0cea22c3bf2b78c7f8df831bfaad3c0a89b7301d45a188db581652b4b465", size = 19912 },
+]
+
 [[package]]
 name = "realtime"
 version = "2.4.3"