fix: Refactor web reader to use readability-lxml

Replaces the custom readability parsing logic with the readability-lxml library for improved performance and simplicity. This reduces code complexity by eliminating the need for Readability.js and Node.js subprocesses, streamlining the parsing process. Introduces a dataclass for structured article data and updates the dependencies to include readability-lxml and its requirements.

Improves maintainability and performance.

Signed-off-by: -LAN- <laipz8200@outlook.com>
pull/19789/head
-LAN- 1 year ago
parent 55503ce771
commit cbe020be23
No known key found for this signature in database
GPG Key ID: 6BA0D108DED011FF

@ -1,20 +1,18 @@
import hashlib
import json
import mimetypes
import os
import re
import site
import subprocess
import tempfile
import unicodedata
from contextlib import contextmanager
from pathlib import Path
from dataclasses import dataclass
from typing import Any, Literal, Optional, cast
from urllib.parse import unquote
import chardet
import cloudscraper # type: ignore
from bs4 import BeautifulSoup, CData, Comment, NavigableString # type: ignore
from readability import Document # type: ignore
from regex import regex # type: ignore
from core.helper import ssrf_proxy
@ -23,9 +21,7 @@ from core.rag.extractor.extract_processor import ExtractProcessor
FULL_TEMPLATE = """
TITLE: {title}
AUTHORS: {authors}
PUBLISH DATE: {publish_date}
TOP_IMAGE_URL: {top_image}
AUTHOR: {author}
TEXT:
{text}
@ -90,66 +86,40 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str:
else:
content = response.text
a = extract_using_readabilipy(content)
article = extract_using_readabilipy(content)
if not a["plain_text"] or not a["plain_text"].strip():
if not article.text.strip():
return ""
res = FULL_TEMPLATE.format(
title=a["title"],
authors=a["byline"],
publish_date=a["date"],
top_image="",
text=a["plain_text"] or "",
title=article.title,
author=article.auther,
text=article.text,
)
return res
def extract_using_readabilipy(html):
    """Run Mozilla's Readability.js on ``html`` via node and return the article fields.

    The HTML is written to a temporary file, ``ExtractArticle.js`` from the
    installed ``readabilipy`` package is run on it with ``node``, and the JSON
    file it produces is mapped onto a dictionary. Both temporary files are
    removed afterwards, whether or not the extraction succeeded.

    Args:
        html: HTML source of the page, as text.

    Returns:
        A dict with the keys ``title``, ``byline``, ``date``, ``content``,
        ``plain_content`` and ``plain_text``. A key stays ``None`` when the
        Readability output did not provide the corresponding field.

    Raises:
        subprocess.CalledProcessError: if the node process exits with a
            non-zero status.
    """
    # delete=False: the file has to outlive its handle so that node can open it by path.
    f_html = tempfile.NamedTemporaryFile(delete=False, mode="w+")
    html_path = f_html.name
    article_json_path = html_path + ".json"
    try:
        # Leaving the ``with`` block closes (and thereby flushes) the file before node reads it.
        with f_html:
            f_html.write(html)

        # Call Mozilla's Readability.js Readability.parse() function via node,
        # writing output to a temporary file.
        jsdir = os.path.join(find_module_path("readabilipy"), "javascript")
        with chdir(jsdir):
            subprocess.check_call(["node", "ExtractArticle.js", "-i", html_path, "-o", article_json_path])

        # Read output of call to Readability.parse() from the JSON file.
        input_json = json.loads(Path(article_json_path).read_text(encoding="utf-8"))
    finally:
        # Always delete the temporary files; the JSON file may not exist if node failed.
        for tmp_path in (article_json_path, html_path):
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass

    article_json: dict[str, Any] = {
        "title": None,
        "byline": None,
        "date": None,
        "content": None,
        "plain_content": None,
        "plain_text": None,
    }

    # Populate article fields from readability fields where present.
    if input_json:
        if input_json.get("title"):
            article_json["title"] = input_json["title"]
        if input_json.get("byline"):
            article_json["byline"] = input_json["byline"]
        if input_json.get("date"):
            article_json["date"] = input_json["date"]
        if input_json.get("content"):
            article_json["content"] = input_json["content"]
            article_json["plain_content"] = plain_content(article_json["content"], False, False)
            article_json["plain_text"] = extract_text_blocks_as_plain_text(article_json["plain_content"])
        if input_json.get("textContent"):
            # Readability's own text output takes precedence; collapse blank lines in it.
            article_json["plain_text"] = input_json["textContent"]
            article_json["plain_text"] = re.sub(r"\n\s*\n", "\n", article_json["plain_text"])

    return article_json
@dataclass
class Article:
    """Structured result of extracting an article from an HTML page."""

    # Title of the document.
    title: str
    # Author of the document. NOTE: the field name is a misspelling of "author";
    # it is kept because existing code constructs and reads it by this name.
    auther: str
    # Extracted body content of the document.
    text: str

    @property
    def author(self) -> str:
        """Return the author under the correctly spelled name (alias of ``auther``)."""
        return self.auther
def extract_using_readabilipy(html: str) -> Article:
    """Extract title, author and body content from ``html`` with readability-lxml.

    Args:
        html: HTML source of the page, as text.

    Returns:
        An ``Article`` whose ``title`` and ``auther`` come from the readability
        ``Document`` and whose ``text`` is the document content passed through
        ``plain_content``. For empty or whitespace-only input an ``Article``
        with empty strings is returned.
    """
    # Guard against empty input: there is nothing to extract, and lxml refuses
    # to parse an empty document.
    if not html or not html.strip():
        return Article(title="", auther="", text="")

    doc = Document(html)
    # NOTE(review): ``doc.content()`` is the document body, whereas
    # ``doc.summary()`` is readability's extracted main article - confirm which
    # one is intended here.
    return Article(
        title=doc.title(),
        auther=doc.author(),
        text=plain_content(
            readability_content=doc.content(),
            content_digests=False,
            node_indexes=False,
        ),
    )
def find_module_path(module_name):

@ -83,6 +83,7 @@ dependencies = [
"weave~=0.51.0",
"yarl~=1.18.3",
"webvtt-py~=0.5.1",
"readability-lxml~=0.8.4.1",
]
# Before adding a new dependency, consider placing it in
# alphabetical order (a-z) and in a suitable group.

@ -1151,6 +1151,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/87/62/d69eb4a8ee231f4bf733a92caf9da13f1c81a44e874b1d4080c25ecbb723/cryptography-44.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:5d20cc348cca3a8aa7312f42ab953a56e15323800ca3ab0706b8cd452a3a056c", size = 3134369 },
]
[[package]]
name = "cssselect"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786 },
]
[[package]]
name = "dataclasses-json"
version = "0.6.7"
@ -1276,6 +1285,7 @@ dependencies = [
{ name = "python-dotenv" },
{ name = "pyyaml" },
{ name = "readabilipy" },
{ name = "readability-lxml" },
{ name = "redis", extra = ["hiredis"] },
{ name = "resend" },
{ name = "sentry-sdk", extra = ["flask"] },
@ -1447,6 +1457,7 @@ requires-dist = [
{ name = "python-dotenv", specifier = "==1.0.1" },
{ name = "pyyaml", specifier = "~=6.0.1" },
{ name = "readabilipy", specifier = "~=0.3.0" },
{ name = "readability-lxml", specifier = "~=0.8.4.1" },
{ name = "redis", extras = ["hiredis"], specifier = "~=6.1.0" },
{ name = "resend", specifier = "~=2.9.0" },
{ name = "sentry-sdk", extras = ["flask"], specifier = "~=2.28.0" },
@ -2931,6 +2942,23 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095 },
]
[package.optional-dependencies]
html-clean = [
{ name = "lxml-html-clean" },
]
[[package]]
name = "lxml-html-clean"
version = "0.4.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "lxml" },
]
sdist = { url = "https://files.pythonhosted.org/packages/79/b6/466e71db127950fb8d172026a8f0a9f0dc6f64c8e78e2ca79f252e5790b8/lxml_html_clean-0.4.2.tar.gz", hash = "sha256:91291e7b5db95430abf461bc53440964d58e06cc468950f9e47db64976cebcb3", size = 21622 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4e/0b/942cb7278d6caad79343ad2ddd636ed204a47909b969d19114a3097f5aa3/lxml_html_clean-0.4.2-py3-none-any.whl", hash = "sha256:74ccfba277adcfea87a1e9294f47dd86b05d65b4da7c5b07966e3d5f3be8a505", size = 14184 },
]
[[package]]
name = "lxml-stubs"
version = "0.5.1"
@ -4838,6 +4866,20 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dd/46/8a640c6de1a6c6af971f858b2fb178ca5e1db91f223d8ba5f40efe1491e5/readabilipy-0.3.0-py3-none-any.whl", hash = "sha256:d106da0fad11d5fdfcde21f5c5385556bfa8ff0258483037d39ea6b1d6db3943", size = 22158 },
]
[[package]]
name = "readability-lxml"
version = "0.8.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "chardet" },
{ name = "cssselect" },
{ name = "lxml", extra = ["html-clean"] },
]
sdist = { url = "https://files.pythonhosted.org/packages/55/3e/dc87d97532ddad58af786ec89c7036182e352574c1cba37bf2bf783d2b15/readability_lxml-0.8.4.1.tar.gz", hash = "sha256:9d2924f5942dd7f37fb4da353263b22a3e877ccf922d0e45e348e4177b035a53", size = 22874 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/75/2cc58965097e351415af420be81c4665cf80da52a17ef43c01ffbe2caf91/readability_lxml-0.8.4.1-py3-none-any.whl", hash = "sha256:874c0cea22c3bf2b78c7f8df831bfaad3c0a89b7301d45a188db581652b4b465", size = 19912 },
]
[[package]]
name = "realtime"
version = "2.4.3"

Loading…
Cancel
Save