fix: Refactor web reader to use readability-lxml

Replaces the custom readability parsing logic with the readability-lxml library for improved performance and simplicity. This change reduces code complexity by eliminating the need for Readability.js and Node.js subprocesses, streamlining the parsing process. Introduces a dataclass for structured article data. Updates dependencies to include readability-lxml and its requirements.

Improves maintainability and performance.

Signed-off-by: -LAN- <laipz8200@outlook.com>
pull/19789/head
-LAN- 1 year ago
parent 55503ce771
commit cbe020be23
No known key found for this signature in database
GPG Key ID: 6BA0D108DED011FF

@ -1,20 +1,18 @@
import hashlib import hashlib
import json
import mimetypes import mimetypes
import os import os
import re import re
import site import site
import subprocess
import tempfile
import unicodedata import unicodedata
from contextlib import contextmanager from contextlib import contextmanager
from pathlib import Path from dataclasses import dataclass
from typing import Any, Literal, Optional, cast from typing import Any, Literal, Optional, cast
from urllib.parse import unquote from urllib.parse import unquote
import chardet import chardet
import cloudscraper # type: ignore import cloudscraper # type: ignore
from bs4 import BeautifulSoup, CData, Comment, NavigableString # type: ignore from bs4 import BeautifulSoup, CData, Comment, NavigableString # type: ignore
from readability import Document # type: ignore
from regex import regex # type: ignore from regex import regex # type: ignore
from core.helper import ssrf_proxy from core.helper import ssrf_proxy
@ -23,9 +21,7 @@ from core.rag.extractor.extract_processor import ExtractProcessor
FULL_TEMPLATE = """ FULL_TEMPLATE = """
TITLE: {title} TITLE: {title}
AUTHORS: {authors} AUTHOR: {author}
PUBLISH DATE: {publish_date}
TOP_IMAGE_URL: {top_image}
TEXT: TEXT:
{text} {text}
@ -90,66 +86,40 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str:
else: else:
content = response.text content = response.text
a = extract_using_readabilipy(content) article = extract_using_readabilipy(content)
if not a["plain_text"] or not a["plain_text"].strip(): if not article.text.strip():
return "" return ""
res = FULL_TEMPLATE.format( res = FULL_TEMPLATE.format(
title=a["title"], title=article.title,
authors=a["byline"], author=article.auther,
publish_date=a["date"], text=article.text,
top_image="",
text=a["plain_text"] or "",
) )
return res return res
def extract_using_readabilipy(html): @dataclass
with tempfile.NamedTemporaryFile(delete=False, mode="w+") as f_html: class Article:
f_html.write(html) title: str
f_html.close() auther: str
html_path = f_html.name text: str
# Call Mozilla's Readability.js Readability.parse() function via node, writing output to a temporary file
article_json_path = html_path + ".json" def extract_using_readabilipy(html: str):
jsdir = os.path.join(find_module_path("readabilipy"), "javascript") doc = Document(html)
with chdir(jsdir): article = Article(
subprocess.check_call(["node", "ExtractArticle.js", "-i", html_path, "-o", article_json_path]) title=doc.title(),
auther=doc.author(),
# Read output of call to Readability.parse() from JSON file and return as Python dictionary text=plain_content(
input_json = json.loads(Path(article_json_path).read_text(encoding="utf-8")) readability_content=doc.content(),
content_digests=False,
# Deleting files after processing node_indexes=False,
os.unlink(article_json_path) ),
os.unlink(html_path) )
article_json: dict[str, Any] = { return article
"title": None,
"byline": None,
"date": None,
"content": None,
"plain_content": None,
"plain_text": None,
}
# Populate article fields from readability fields where present
if input_json:
if input_json.get("title"):
article_json["title"] = input_json["title"]
if input_json.get("byline"):
article_json["byline"] = input_json["byline"]
if input_json.get("date"):
article_json["date"] = input_json["date"]
if input_json.get("content"):
article_json["content"] = input_json["content"]
article_json["plain_content"] = plain_content(article_json["content"], False, False)
article_json["plain_text"] = extract_text_blocks_as_plain_text(article_json["plain_content"])
if input_json.get("textContent"):
article_json["plain_text"] = input_json["textContent"]
article_json["plain_text"] = re.sub(r"\n\s*\n", "\n", article_json["plain_text"])
return article_json
def find_module_path(module_name): def find_module_path(module_name):

@ -83,6 +83,7 @@ dependencies = [
"weave~=0.51.0", "weave~=0.51.0",
"yarl~=1.18.3", "yarl~=1.18.3",
"webvtt-py~=0.5.1", "webvtt-py~=0.5.1",
"readability-lxml~=0.8.4.1",
] ]
# Before adding new dependency, consider place it in # Before adding new dependency, consider place it in
# alphabet order (a-z) and suitable group. # alphabet order (a-z) and suitable group.

@ -1151,6 +1151,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/87/62/d69eb4a8ee231f4bf733a92caf9da13f1c81a44e874b1d4080c25ecbb723/cryptography-44.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c", size = 3134369 }, { url = "https://files.pythonhosted.org/packages/87/62/d69eb4a8ee231f4bf733a92caf9da13f1c81a44e874b1d4080c25ecbb723/cryptography-44.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c", size = 3134369 },
] ]
[[package]]
name = "cssselect"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786 },
]
[[package]] [[package]]
name = "dataclasses-json" name = "dataclasses-json"
version = "0.6.7" version = "0.6.7"
@ -1276,6 +1285,7 @@ dependencies = [
{ name = "python-dotenv" }, { name = "python-dotenv" },
{ name = "pyyaml" }, { name = "pyyaml" },
{ name = "readabilipy" }, { name = "readabilipy" },
{ name = "readability-lxml" },
{ name = "redis", extra = ["hiredis"] }, { name = "redis", extra = ["hiredis"] },
{ name = "resend" }, { name = "resend" },
{ name = "sentry-sdk", extra = ["flask"] }, { name = "sentry-sdk", extra = ["flask"] },
@ -1447,6 +1457,7 @@ requires-dist = [
{ name = "python-dotenv", specifier = "==1.0.1" }, { name = "python-dotenv", specifier = "==1.0.1" },
{ name = "pyyaml", specifier = "~=6.0.1" }, { name = "pyyaml", specifier = "~=6.0.1" },
{ name = "readabilipy", specifier = "~=0.3.0" }, { name = "readabilipy", specifier = "~=0.3.0" },
{ name = "readability-lxml", specifier = "~=0.8.4.1" },
{ name = "redis", extras = ["hiredis"], specifier = "~=6.1.0" }, { name = "redis", extras = ["hiredis"], specifier = "~=6.1.0" },
{ name = "resend", specifier = "~=2.9.0" }, { name = "resend", specifier = "~=2.9.0" },
{ name = "sentry-sdk", extras = ["flask"], specifier = "~=2.28.0" }, { name = "sentry-sdk", extras = ["flask"], specifier = "~=2.28.0" },
@ -2931,6 +2942,23 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095 }, { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095 },
] ]
[package.optional-dependencies]
html-clean = [
{ name = "lxml-html-clean" },
]
[[package]]
name = "lxml-html-clean"
version = "0.4.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "lxml" },
]
sdist = { url = "https://files.pythonhosted.org/packages/79/b6/466e71db127950fb8d172026a8f0a9f0dc6f64c8e78e2ca79f252e5790b8/lxml_html_clean-0.4.2.tar.gz", hash = "sha256:91291e7b5db95430abf461bc53440964d58e06cc468950f9e47db64976cebcb3", size = 21622 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4e/0b/942cb7278d6caad79343ad2ddd636ed204a47909b969d19114a3097f5aa3/lxml_html_clean-0.4.2-py3-none-any.whl", hash = "sha256:74ccfba277adcfea87a1e9294f47dd86b05d65b4da7c5b07966e3d5f3be8a505", size = 14184 },
]
[[package]] [[package]]
name = "lxml-stubs" name = "lxml-stubs"
version = "0.5.1" version = "0.5.1"
@ -4838,6 +4866,20 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dd/46/8a640c6de1a6c6af971f858b2fb178ca5e1db91f223d8ba5f40efe1491e5/readabilipy-0.3.0-py3-none-any.whl", hash = "sha256:d106da0fad11d5fdfcde21f5c5385556bfa8ff0258483037d39ea6b1d6db3943", size = 22158 }, { url = "https://files.pythonhosted.org/packages/dd/46/8a640c6de1a6c6af971f858b2fb178ca5e1db91f223d8ba5f40efe1491e5/readabilipy-0.3.0-py3-none-any.whl", hash = "sha256:d106da0fad11d5fdfcde21f5c5385556bfa8ff0258483037d39ea6b1d6db3943", size = 22158 },
] ]
[[package]]
name = "readability-lxml"
version = "0.8.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "chardet" },
{ name = "cssselect" },
{ name = "lxml", extra = ["html-clean"] },
]
sdist = { url = "https://files.pythonhosted.org/packages/55/3e/dc87d97532ddad58af786ec89c7036182e352574c1cba37bf2bf783d2b15/readability_lxml-0.8.4.1.tar.gz", hash = "sha256:9d2924f5942dd7f37fb4da353263b22a3e877ccf922d0e45e348e4177b035a53", size = 22874 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/75/2cc58965097e351415af420be81c4665cf80da52a17ef43c01ffbe2caf91/readability_lxml-0.8.4.1-py3-none-any.whl", hash = "sha256:874c0cea22c3bf2b78c7f8df831bfaad3c0a89b7301d45a188db581652b4b465", size = 19912 },
]
[[package]] [[package]]
name = "realtime" name = "realtime"
version = "2.4.3" version = "2.4.3"

Loading…
Cancel
Save